//===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
/// The pass tries to use the 32-bit encoding for instructions when possible.
//===----------------------------------------------------------------------===//
//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"

#define DEBUG_TYPE "si-shrink-instructions"

STATISTIC(NumInstructionsShrunk,
          "Number of 64-bit instructions reduced to 32-bit.");
STATISTIC(NumLiteralConstantsFolded,
          "Number of literal constants folded into 32-bit instructions.");

using namespace llvm;

namespace {

class SIShrinkInstructions : public MachineFunctionPass {
  MachineFunction *MF;
  MachineRegisterInfo *MRI;
  const GCNSubtarget *ST;
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;

public:
  static char ID;

  SIShrinkInstructions() : MachineFunctionPass(ID) {}

  bool foldImmediates(MachineInstr &MI, bool TryToCommute = true) const;
  bool shouldShrinkTrue16(MachineInstr &MI) const;
  bool isKImmOperand(const MachineOperand &Src) const;
  bool isKUImmOperand(const MachineOperand &Src) const;
  bool isKImmOrKUImmOperand(const MachineOperand &Src, bool &IsUnsigned) const;
  bool isReverseInlineImm(const MachineOperand &Src, int32_t &ReverseImm) const;
  void copyExtraImplicitOps(MachineInstr &NewMI, MachineInstr &MI) const;
  void shrinkScalarCompare(MachineInstr &MI) const;
  void shrinkMIMG(MachineInstr &MI) const;
  void shrinkMadFma(MachineInstr &MI) const;
  bool shrinkScalarLogicOp(MachineInstr &MI) const;
  bool tryReplaceDeadSDST(MachineInstr &MI) const;
  bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R,
                     Register Reg, unsigned SubReg) const;
  bool instReadsReg(const MachineInstr *MI, unsigned Reg,
                    unsigned SubReg) const;
  bool instModifiesReg(const MachineInstr *MI, unsigned Reg,
                       unsigned SubReg) const;
  TargetInstrInfo::RegSubRegPair getSubRegForIndex(Register Reg, unsigned Sub,
                                                   unsigned I) const;
  void dropInstructionKeepingImpDefs(MachineInstr &MI) const;
  MachineInstr *matchSwap(MachineInstr &MovT) const;

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Shrink Instructions"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS(SIShrinkInstructions, DEBUG_TYPE,
                "SI Shrink Instructions", false, false)

char SIShrinkInstructions::ID = 0;

FunctionPass *llvm::createSIShrinkInstructionsPass() {
  return new SIShrinkInstructions();
}

/// This function checks \p MI for operands defined by a move immediate
/// instruction and then folds the literal constant into the instruction if it
/// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instruction.
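//
// As an illustrative sketch (register numbers and the literal value are made
// up), a sequence such as
//   %1:vgpr_32 = V_MOV_B32_e32 1078530011, implicit $exec
//   %2:vgpr_32 = V_ADD_F32_e32 %1, %0, implicit $mode, implicit $exec
// can be rewritten, when the literal is legal for src0, as
//   %2:vgpr_32 = V_ADD_F32_e32 1078530011, %0, implicit $mode, implicit $exec
// with the V_MOV erased once it has no remaining non-debug uses.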
bool SIShrinkInstructions::foldImmediates(MachineInstr &MI,
                                          bool TryToCommute) const {
  assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI));

  int Src0Idx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);

  // Try to fold Src0
  MachineOperand &Src0 = MI.getOperand(Src0Idx);
  if (Src0.isReg()) {
    Register Reg = Src0.getReg();
    if (Reg.isVirtual()) {
      MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
      if (Def && Def->isMoveImmediate()) {
        MachineOperand &MovSrc = Def->getOperand(1);
        bool ConstantFolded = false;

        if (TII->isOperandLegal(MI, Src0Idx, &MovSrc)) {
          if (MovSrc.isImm() &&
              (isInt<32>(MovSrc.getImm()) || isUInt<32>(MovSrc.getImm()))) {
            Src0.ChangeToImmediate(MovSrc.getImm());
            ConstantFolded = true;
          } else if (MovSrc.isFI()) {
            Src0.ChangeToFrameIndex(MovSrc.getIndex());
            ConstantFolded = true;
          } else if (MovSrc.isGlobal()) {
            Src0.ChangeToGA(MovSrc.getGlobal(), MovSrc.getOffset(),
                            MovSrc.getTargetFlags());
            ConstantFolded = true;
          }
        }

        if (ConstantFolded) {
          if (MRI->use_nodbg_empty(Reg))
            Def->eraseFromParent();
          ++NumLiteralConstantsFolded;
          return true;
        }
      }
    }
  }

  // We have failed to fold src0, so commute the instruction and try again.
  if (TryToCommute && MI.isCommutable()) {
    if (TII->commuteInstruction(MI)) {
      if (foldImmediates(MI, false))
        return true;

      // Commute back.
      TII->commuteInstruction(MI);
    }
  }

  return false;
}

/// Do not shrink the instruction if its registers are not expressible in the
/// shrunk encoding.
bool SIShrinkInstructions::shouldShrinkTrue16(MachineInstr &MI) const {
  for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
    const MachineOperand &MO = MI.getOperand(I);
    if (MO.isReg()) {
      Register Reg = MO.getReg();
      assert(!Reg.isVirtual() && "Prior checks should ensure we only shrink "
                                 "True16 Instructions post-RA");
      if (AMDGPU::VGPR_32RegClass.contains(Reg) &&
          !AMDGPU::VGPR_32_Lo128RegClass.contains(Reg))
        return false;
    }
  }
  return true;
}

bool SIShrinkInstructions::isKImmOperand(const MachineOperand &Src) const {
  return isInt<16>(Src.getImm()) &&
         !TII->isInlineConstant(*Src.getParent(), Src.getOperandNo());
}

bool SIShrinkInstructions::isKUImmOperand(const MachineOperand &Src) const {
  return isUInt<16>(Src.getImm()) &&
         !TII->isInlineConstant(*Src.getParent(), Src.getOperandNo());
}

bool SIShrinkInstructions::isKImmOrKUImmOperand(const MachineOperand &Src,
                                                bool &IsUnsigned) const {
  if (isInt<16>(Src.getImm())) {
    IsUnsigned = false;
    return !TII->isInlineConstant(Src);
  }

  if (isUInt<16>(Src.getImm())) {
    IsUnsigned = true;
    return !TII->isInlineConstant(Src);
  }

  return false;
}

/// \returns true if the constant in \p Src should be replaced with a bitreverse
/// of an inline immediate.
bool SIShrinkInstructions::isReverseInlineImm(const MachineOperand &Src,
                                              int32_t &ReverseImm) const {
  if (!isInt<32>(Src.getImm()) || TII->isInlineConstant(Src))
    return false;

  ReverseImm = reverseBits<int32_t>(static_cast<int32_t>(Src.getImm()));
  return ReverseImm >= -16 && ReverseImm <= 64;
}

/// Copy implicit register operands from specified instruction to this
/// instruction that are not part of the instruction definition.
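//
// "Extra" here means operands appended past the MCInstrDesc's declared
// explicit and implicit operands -- for example an implicit register use or a
// regmask attached by an earlier pass -- which would otherwise be lost when a
// shrunk replacement instruction is built. (Descriptive note only; the loop
// below simply copies every such trailing implicit-reg or regmask operand.)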
void SIShrinkInstructions::copyExtraImplicitOps(MachineInstr &NewMI,
                                                MachineInstr &MI) const {
  MachineFunction &MF = *MI.getMF();
  for (unsigned i = MI.getDesc().getNumOperands() +
                    MI.getDesc().implicit_uses().size() +
                    MI.getDesc().implicit_defs().size(),
                e = MI.getNumOperands();
       i != e; ++i) {
    const MachineOperand &MO = MI.getOperand(i);
    if ((MO.isReg() && MO.isImplicit()) || MO.isRegMask())
      NewMI.addOperand(MF, MO);
  }
}

void SIShrinkInstructions::shrinkScalarCompare(MachineInstr &MI) const {
  // cmpk instructions do scc = dst <cc op> imm16, so commute the instruction to
  // get constants on the RHS.
  if (!MI.getOperand(0).isReg())
    TII->commuteInstruction(MI, false, 0, 1);

  // cmpk requires src0 to be a register
  const MachineOperand &Src0 = MI.getOperand(0);
  if (!Src0.isReg())
    return;

  const MachineOperand &Src1 = MI.getOperand(1);
  if (!Src1.isImm())
    return;

  int SOPKOpc = AMDGPU::getSOPKOp(MI.getOpcode());
  if (SOPKOpc == -1)
    return;

  // eq/ne is special because the imm16 can be treated as signed or unsigned,
  // and initially selected to the unsigned versions.
  if (SOPKOpc == AMDGPU::S_CMPK_EQ_U32 || SOPKOpc == AMDGPU::S_CMPK_LG_U32) {
    bool HasUImm;
    if (isKImmOrKUImmOperand(Src1, HasUImm)) {
      if (!HasUImm) {
        SOPKOpc = (SOPKOpc == AMDGPU::S_CMPK_EQ_U32) ?
                      AMDGPU::S_CMPK_EQ_I32 : AMDGPU::S_CMPK_LG_I32;
      }

      MI.setDesc(TII->get(SOPKOpc));
    }

    return;
  }

  const MCInstrDesc &NewDesc = TII->get(SOPKOpc);

  if ((TII->sopkIsZext(SOPKOpc) && isKUImmOperand(Src1)) ||
      (!TII->sopkIsZext(SOPKOpc) && isKImmOperand(Src1))) {
    MI.setDesc(NewDesc);
  }
}

// Shrink NSA encoded instructions with contiguous VGPRs to non-NSA encoding.
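//
// A rough example (concrete registers and operands are made up): a GFX10 NSA
// form such as
//   image_sample v0, [v4, v5], s[0:7], s[8:11]
// can be re-encoded in the shorter non-NSA form
//   image_sample v0, v[4:5], s[0:7], s[8:11]
// because the address registers already happen to be consecutive.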
void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) const {
  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
  if (!Info)
    return;

  uint8_t NewEncoding;
  switch (Info->MIMGEncoding) {
  case AMDGPU::MIMGEncGfx10NSA:
    NewEncoding = AMDGPU::MIMGEncGfx10Default;
    break;
  case AMDGPU::MIMGEncGfx11NSA:
    NewEncoding = AMDGPU::MIMGEncGfx11Default;
    break;
  default:
    return;
  }

  int VAddr0Idx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
  unsigned NewAddrDwords = Info->VAddrDwords;
  const TargetRegisterClass *RC;

  if (Info->VAddrDwords == 2) {
    RC = &AMDGPU::VReg_64RegClass;
  } else if (Info->VAddrDwords == 3) {
    RC = &AMDGPU::VReg_96RegClass;
  } else if (Info->VAddrDwords == 4) {
    RC = &AMDGPU::VReg_128RegClass;
  } else if (Info->VAddrDwords == 5) {
    RC = &AMDGPU::VReg_160RegClass;
  } else if (Info->VAddrDwords == 6) {
    RC = &AMDGPU::VReg_192RegClass;
  } else if (Info->VAddrDwords == 7) {
    RC = &AMDGPU::VReg_224RegClass;
  } else if (Info->VAddrDwords == 8) {
    RC = &AMDGPU::VReg_256RegClass;
  } else if (Info->VAddrDwords == 9) {
    RC = &AMDGPU::VReg_288RegClass;
  } else if (Info->VAddrDwords == 10) {
    RC = &AMDGPU::VReg_320RegClass;
  } else if (Info->VAddrDwords == 11) {
    RC = &AMDGPU::VReg_352RegClass;
  } else if (Info->VAddrDwords == 12) {
    RC = &AMDGPU::VReg_384RegClass;
  } else {
    RC = &AMDGPU::VReg_512RegClass;
    NewAddrDwords = 16;
  }

  unsigned VgprBase = 0;
  unsigned NextVgpr = 0;
  bool IsUndef = true;
  bool IsKill = NewAddrDwords == Info->VAddrDwords;
  const unsigned NSAMaxSize = ST->getNSAMaxSize();
  const bool IsPartialNSA = NewAddrDwords > NSAMaxSize;
  const unsigned EndVAddr = IsPartialNSA ? NSAMaxSize : Info->VAddrOperands;
  for (unsigned Idx = 0; Idx < EndVAddr; ++Idx) {
    const MachineOperand &Op = MI.getOperand(VAddr0Idx + Idx);
    unsigned Vgpr = TRI->getHWRegIndex(Op.getReg());
    unsigned Dwords = TRI->getRegSizeInBits(Op.getReg(), *MRI) / 32;
    assert(Dwords > 0 && "Un-implemented for less than 32 bit regs");

    if (Idx == 0) {
      VgprBase = Vgpr;
      NextVgpr = Vgpr + Dwords;
    } else if (Vgpr == NextVgpr) {
      NextVgpr = Vgpr + Dwords;
    } else {
      return;
    }

    if (!Op.isUndef())
      IsUndef = false;
    if (!Op.isKill())
      IsKill = false;
  }

  if (VgprBase + NewAddrDwords > 256)
    return;

  // Further check for implicit tied operands - this may be present if TFE is
  // enabled
  int TFEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::tfe);
  int LWEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::lwe);
  unsigned TFEVal = (TFEIdx == -1) ? 0 : MI.getOperand(TFEIdx).getImm();
  unsigned LWEVal = (LWEIdx == -1) ? 0 : MI.getOperand(LWEIdx).getImm();
  int ToUntie = -1;
  if (TFEVal || LWEVal) {
    // TFE/LWE is enabled so we need to deal with an implicit tied operand
    for (unsigned i = LWEIdx + 1, e = MI.getNumOperands(); i != e; ++i) {
      if (MI.getOperand(i).isReg() && MI.getOperand(i).isTied() &&
          MI.getOperand(i).isImplicit()) {
        // This is the tied operand
        assert(
            ToUntie == -1 &&
            "found more than one tied implicit operand when expecting only 1");
        ToUntie = i;
        MI.untieRegOperand(ToUntie);
      }
    }
  }

  unsigned NewOpcode = AMDGPU::getMIMGOpcode(Info->BaseOpcode, NewEncoding,
                                             Info->VDataDwords, NewAddrDwords);
  MI.setDesc(TII->get(NewOpcode));
  MI.getOperand(VAddr0Idx).setReg(RC->getRegister(VgprBase));
  MI.getOperand(VAddr0Idx).setIsUndef(IsUndef);
  MI.getOperand(VAddr0Idx).setIsKill(IsKill);

  for (unsigned i = 1; i < EndVAddr; ++i)
    MI.removeOperand(VAddr0Idx + 1);

  if (ToUntie >= 0) {
    MI.tieOperands(
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata),
        ToUntie - (EndVAddr - 1));
  }
}

// Shrink MAD to MADAK/MADMK and FMA to FMAAK/FMAMK.
void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
  // Pre-GFX10 VOP3 instructions like MAD/FMA cannot take a literal operand so
  // there is no reason to try to shrink them.
  if (!ST->hasVOP3Literal())
    return;

  // There is no advantage to doing this pre-RA.
  if (!MF->getProperties().hasProperty(
          MachineFunctionProperties::Property::NoVRegs))
    return;

  if (TII->hasAnyModifiersSet(MI))
    return;

  const unsigned Opcode = MI.getOpcode();
  MachineOperand &Src0 = *TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand &Src1 = *TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  MachineOperand &Src2 = *TII->getNamedOperand(MI, AMDGPU::OpName::src2);
  unsigned NewOpcode = AMDGPU::INSTRUCTION_LIST_END;

  bool Swap;

  // Detect "Dst = VSrc * VGPR + Imm" and convert to AK form.
  if (Src2.isImm() && !TII->isInlineConstant(Src2)) {
    if (Src1.isReg() && TRI->isVGPR(*MRI, Src1.getReg()))
      Swap = false;
    else if (Src0.isReg() && TRI->isVGPR(*MRI, Src0.getReg()))
      Swap = true;
    else
      return;

    switch (Opcode) {
    default:
      llvm_unreachable("Unexpected mad/fma opcode!");
    case AMDGPU::V_MAD_F32_e64:
      NewOpcode = AMDGPU::V_MADAK_F32;
      break;
    case AMDGPU::V_FMA_F32_e64:
      NewOpcode = AMDGPU::V_FMAAK_F32;
      break;
    case AMDGPU::V_MAD_F16_e64:
      NewOpcode = AMDGPU::V_MADAK_F16;
      break;
    case AMDGPU::V_FMA_F16_e64:
    case AMDGPU::V_FMA_F16_gfx9_e64:
      NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_t16
                                          : AMDGPU::V_FMAAK_F16;
      break;
    }
  }

  // Detect "Dst = VSrc * Imm + VGPR" and convert to MK form.
  if (Src2.isReg() && TRI->isVGPR(*MRI, Src2.getReg())) {
    if (Src1.isImm() && !TII->isInlineConstant(Src1))
      Swap = false;
    else if (Src0.isImm() && !TII->isInlineConstant(Src0))
      Swap = true;
    else
      return;

    switch (Opcode) {
    default:
      llvm_unreachable("Unexpected mad/fma opcode!");
    case AMDGPU::V_MAD_F32_e64:
      NewOpcode = AMDGPU::V_MADMK_F32;
      break;
    case AMDGPU::V_FMA_F32_e64:
      NewOpcode = AMDGPU::V_FMAMK_F32;
      break;
    case AMDGPU::V_MAD_F16_e64:
      NewOpcode = AMDGPU::V_MADMK_F16;
      break;
    case AMDGPU::V_FMA_F16_e64:
    case AMDGPU::V_FMA_F16_gfx9_e64:
      NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_t16
                                          : AMDGPU::V_FMAMK_F16;
      break;
    }
  }

  if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END)
    return;

  if (AMDGPU::isTrue16Inst(NewOpcode) && !shouldShrinkTrue16(MI))
    return;

  if (Swap) {
    // Swap Src0 and Src1 by building a new instruction.
    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(NewOpcode),
            MI.getOperand(0).getReg())
        .add(Src1)
        .add(Src0)
        .add(Src2)
        .setMIFlags(MI.getFlags());
    MI.eraseFromParent();
  } else {
    TII->removeModOperands(MI);
    MI.setDesc(TII->get(NewOpcode));
  }
}

/// Attempt to shrink AND/OR/XOR operations requiring non-inlineable literals.
/// For AND or OR, try using S_BITSET{0,1} to clear or set bits.
/// If the inverse of the immediate is legal, use ANDN2, ORN2 or
/// XNOR (as a ^ b == ~(a ^ ~b)).
/// \returns true if the caller should continue the machine function iterator
bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();
  const MachineOperand *Dest = &MI.getOperand(0);
  MachineOperand *Src0 = &MI.getOperand(1);
  MachineOperand *Src1 = &MI.getOperand(2);
  MachineOperand *SrcReg = Src0;
  MachineOperand *SrcImm = Src1;

  if (!SrcImm->isImm() ||
      AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST->hasInv2PiInlineImm()))
    return false;

  uint32_t Imm = static_cast<uint32_t>(SrcImm->getImm());
  uint32_t NewImm = 0;

  if (Opc == AMDGPU::S_AND_B32) {
    if (isPowerOf2_32(~Imm)) {
      NewImm = llvm::countr_one(Imm);
      Opc = AMDGPU::S_BITSET0_B32;
    } else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
      NewImm = ~Imm;
      Opc = AMDGPU::S_ANDN2_B32;
    }
  } else if (Opc == AMDGPU::S_OR_B32) {
    if (isPowerOf2_32(Imm)) {
      NewImm = llvm::countr_zero(Imm);
      Opc = AMDGPU::S_BITSET1_B32;
    } else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
      NewImm = ~Imm;
      Opc = AMDGPU::S_ORN2_B32;
    }
  } else if (Opc == AMDGPU::S_XOR_B32) {
    if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
      NewImm = ~Imm;
      Opc = AMDGPU::S_XNOR_B32;
    }
  } else {
    llvm_unreachable("unexpected opcode");
  }

  if (NewImm != 0) {
    if (Dest->getReg().isVirtual() && SrcReg->isReg()) {
      MRI->setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg());
      MRI->setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg());
      return true;
    }

    if (SrcReg->isReg() && SrcReg->getReg() == Dest->getReg()) {
      const bool IsUndef = SrcReg->isUndef();
      const bool IsKill = SrcReg->isKill();
      MI.setDesc(TII->get(Opc));
      if (Opc == AMDGPU::S_BITSET0_B32 ||
          Opc == AMDGPU::S_BITSET1_B32) {
        Src0->ChangeToImmediate(NewImm);
        // Remove the immediate and add the tied input.
        MI.getOperand(2).ChangeToRegister(Dest->getReg(), /*IsDef*/ false,
                                          /*isImp*/ false, IsKill,
                                          /*isDead*/ false, IsUndef);
        MI.tieOperands(0, 2);
      } else {
        SrcImm->setImm(NewImm);
      }
    }
  }

  return false;
}

// This is the same as MachineInstr::readsRegister/modifiesRegister except
// it takes subregs into account.
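//
// For example (hypothetical virtual registers): a use of %0.sub1 and a def of
// %0.sub0 refer to the same Register but to disjoint lane masks, so they are
// not considered to access the same value here, whereas the plain
// MachineInstr queries would treat any touch of %0 as overlapping.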
bool SIShrinkInstructions::instAccessReg(
    iterator_range<MachineInstr::const_mop_iterator> &&R, Register Reg,
    unsigned SubReg) const {
  for (const MachineOperand &MO : R) {
    if (!MO.isReg())
      continue;

    if (Reg.isPhysical() && MO.getReg().isPhysical()) {
      if (TRI->regsOverlap(Reg, MO.getReg()))
        return true;
    } else if (MO.getReg() == Reg && Reg.isVirtual()) {
      LaneBitmask Overlap = TRI->getSubRegIndexLaneMask(SubReg) &
                            TRI->getSubRegIndexLaneMask(MO.getSubReg());
      if (Overlap.any())
        return true;
    }
  }
  return false;
}

bool SIShrinkInstructions::instReadsReg(const MachineInstr *MI, unsigned Reg,
                                        unsigned SubReg) const {
  return instAccessReg(MI->uses(), Reg, SubReg);
}

bool SIShrinkInstructions::instModifiesReg(const MachineInstr *MI, unsigned Reg,
                                           unsigned SubReg) const {
  return instAccessReg(MI->defs(), Reg, SubReg);
}

TargetInstrInfo::RegSubRegPair
SIShrinkInstructions::getSubRegForIndex(Register Reg, unsigned Sub,
                                        unsigned I) const {
  if (TRI->getRegSizeInBits(Reg, *MRI) != 32) {
    if (Reg.isPhysical()) {
      Reg = TRI->getSubReg(Reg, TRI->getSubRegFromChannel(I));
    } else {
      Sub = TRI->getSubRegFromChannel(I + TRI->getChannelFromSubReg(Sub));
    }
  }
  return TargetInstrInfo::RegSubRegPair(Reg, Sub);
}

void SIShrinkInstructions::dropInstructionKeepingImpDefs(
    MachineInstr &MI) const {
  for (unsigned i = MI.getDesc().getNumOperands() +
                    MI.getDesc().implicit_uses().size() +
                    MI.getDesc().implicit_defs().size(),
                e = MI.getNumOperands();
       i != e; ++i) {
    const MachineOperand &Op = MI.getOperand(i);
    if (!Op.isDef())
      continue;
    BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
            TII->get(AMDGPU::IMPLICIT_DEF), Op.getReg());
  }

  MI.eraseFromParent();
}

// Match:
// mov t, x
// mov x, y
// mov y, t
//
// =>
//
// mov t, x (t is potentially dead and move eliminated)
// v_swap_b32 x, y
//
// Returns a pointer to the next valid instruction if it was able to create a
// v_swap_b32.
//
// This should not be done too early, so that it does not block folding that
// might remove the matched moves entirely. It is preferably done before RA,
// to release the saved registers, but is also useful after RA, which can
// insert copies.
//
// This is really just a generic peephole rather than a canonical shrinking,
// but its requirements match the pass placement and it reduces code size too.
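//
// A made-up post-RA example of the rewrite (register numbers are arbitrary,
// implicit operands omitted):
//   $vgpr3 = V_MOV_B32_e32 $vgpr0      (t = x)
//   $vgpr0 = V_MOV_B32_e32 $vgpr1      (x = y)
//   $vgpr1 = V_MOV_B32_e32 $vgpr3      (y = t)
// -->
//   $vgpr3 = V_MOV_B32_e32 $vgpr0
//   $vgpr0, $vgpr1 = V_SWAP_B32 $vgpr1, $vgpr0
// The leading move is kept here (and may be dead); when t is an unused
// virtual register it is dropped directly instead.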
MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const {
  assert(MovT.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
         MovT.getOpcode() == AMDGPU::COPY);

  Register T = MovT.getOperand(0).getReg();
  unsigned Tsub = MovT.getOperand(0).getSubReg();
  MachineOperand &Xop = MovT.getOperand(1);

  if (!Xop.isReg())
    return nullptr;
  Register X = Xop.getReg();
  unsigned Xsub = Xop.getSubReg();

  unsigned Size = TII->getOpSize(MovT, 0) / 4;

  if (!TRI->isVGPR(*MRI, X))
    return nullptr;

  const unsigned SearchLimit = 16;
  unsigned Count = 0;
  bool KilledT = false;
  for (auto Iter = std::next(MovT.getIterator()),
            E = MovT.getParent()->instr_end();
       Iter != E && Count < SearchLimit && !KilledT; ++Iter, ++Count) {

    MachineInstr *MovY = &*Iter;
    KilledT = MovY->killsRegister(T, TRI);

    if ((MovY->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
         MovY->getOpcode() != AMDGPU::COPY) ||
        !MovY->getOperand(1).isReg() ||
        MovY->getOperand(1).getReg() != T ||
        MovY->getOperand(1).getSubReg() != Tsub)
      continue;

    Register Y = MovY->getOperand(0).getReg();
    unsigned Ysub = MovY->getOperand(0).getSubReg();

    if (!TRI->isVGPR(*MRI, Y))
      continue;

    MachineInstr *MovX = nullptr;
    for (auto IY = MovY->getIterator(), I = std::next(MovT.getIterator());
         I != IY; ++I) {
      if (instReadsReg(&*I, X, Xsub) || instModifiesReg(&*I, Y, Ysub) ||
          instModifiesReg(&*I, T, Tsub) ||
          (MovX && instModifiesReg(&*I, X, Xsub))) {
        MovX = nullptr;
        break;
      }
      if (!instReadsReg(&*I, Y, Ysub)) {
        if (!MovX && instModifiesReg(&*I, X, Xsub)) {
          MovX = nullptr;
          break;
        }
        continue;
      }
      if (MovX ||
          (I->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
           I->getOpcode() != AMDGPU::COPY) ||
          I->getOperand(0).getReg() != X ||
          I->getOperand(0).getSubReg() != Xsub) {
        MovX = nullptr;
        break;
      }

      if (Size > 1 && (I->getNumImplicitOperands() > (I->isCopy() ? 0U : 1U)))
        continue;

      MovX = &*I;
    }

    if (!MovX)
      continue;

    LLVM_DEBUG(dbgs() << "Matched v_swap_b32:\n" << MovT << *MovX << *MovY);

    for (unsigned I = 0; I < Size; ++I) {
      TargetInstrInfo::RegSubRegPair X1, Y1;
      X1 = getSubRegForIndex(X, Xsub, I);
      Y1 = getSubRegForIndex(Y, Ysub, I);
      MachineBasicBlock &MBB = *MovT.getParent();
      auto MIB = BuildMI(MBB, MovX->getIterator(), MovT.getDebugLoc(),
                         TII->get(AMDGPU::V_SWAP_B32))
                     .addDef(X1.Reg, 0, X1.SubReg)
                     .addDef(Y1.Reg, 0, Y1.SubReg)
                     .addReg(Y1.Reg, 0, Y1.SubReg)
                     .addReg(X1.Reg, 0, X1.SubReg)
                     .getInstr();
      if (MovX->hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
        // Drop implicit EXEC.
        MIB->removeOperand(MIB->getNumExplicitOperands());
        MIB->copyImplicitOps(*MBB.getParent(), *MovX);
      }
    }
    MovX->eraseFromParent();
    dropInstructionKeepingImpDefs(*MovY);
    MachineInstr *Next = &*std::next(MovT.getIterator());

    if (T.isVirtual() && MRI->use_nodbg_empty(T)) {
      dropInstructionKeepingImpDefs(MovT);
    } else {
      Xop.setIsKill(false);
      for (int I = MovT.getNumImplicitOperands() - 1; I >= 0; --I) {
        unsigned OpNo = MovT.getNumExplicitOperands() + I;
        const MachineOperand &Op = MovT.getOperand(OpNo);
        if (Op.isKill() && TRI->regsOverlap(X, Op.getReg()))
          MovT.removeOperand(OpNo);
      }
    }

    return Next;
  }

  return nullptr;
}

// If an instruction has a dead sdst, replace it with the NULL register on
// gfx1030+.
bool SIShrinkInstructions::tryReplaceDeadSDST(MachineInstr &MI) const {
  if (!ST->hasGFX10_3Insts())
    return false;

  MachineOperand *Op = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
  if (!Op)
    return false;
  Register SDstReg = Op->getReg();
  if (SDstReg.isPhysical() || !MRI->use_nodbg_empty(SDstReg))
    return false;

  Op->setReg(ST->isWave32() ? AMDGPU::SGPR_NULL : AMDGPU::SGPR_NULL64);
  return true;
}

bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  this->MF = &MF;
  MRI = &MF.getRegInfo();
  ST = &MF.getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  unsigned VCCReg = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;

  std::vector<unsigned> I1Defs;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      if (MI.getOpcode() == AMDGPU::V_MOV_B32_e32) {
        // If this has a literal constant source that is the same as the
        // reversed bits of an inline immediate, replace with a bitreverse of
        // that constant. This saves 4 bytes in the common case of materializing
        // sign bits.

        // Test if we are after regalloc. We only want to do this after any
        // optimizations happen because this will confuse them.
        // XXX - not exactly a check for post-regalloc run.
        MachineOperand &Src = MI.getOperand(1);
        if (Src.isImm() && MI.getOperand(0).getReg().isPhysical()) {
          int32_t ReverseImm;
          if (isReverseInlineImm(Src, ReverseImm)) {
            MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32));
            Src.setImm(ReverseImm);
            continue;
          }
        }
      }

      if (ST->hasSwap() && (MI.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
                            MI.getOpcode() == AMDGPU::COPY)) {
        if (auto *NextMI = matchSwap(MI)) {
          Next = NextMI->getIterator();
          continue;
        }
      }

      // Try to use S_ADDK_I32 and S_MULK_I32.
      if (MI.getOpcode() == AMDGPU::S_ADD_I32 ||
          MI.getOpcode() == AMDGPU::S_MUL_I32) {
        const MachineOperand *Dest = &MI.getOperand(0);
        MachineOperand *Src0 = &MI.getOperand(1);
        MachineOperand *Src1 = &MI.getOperand(2);

        if (!Src0->isReg() && Src1->isReg()) {
          if (TII->commuteInstruction(MI, false, 1, 2))
            std::swap(Src0, Src1);
        }

        // FIXME: This could work better if hints worked with subregisters. If
        // we have a vector add of a constant, we usually don't get the correct
        // allocation due to the subregister usage.
        if (Dest->getReg().isVirtual() && Src0->isReg()) {
          MRI->setRegAllocationHint(Dest->getReg(), 0, Src0->getReg());
          MRI->setRegAllocationHint(Src0->getReg(), 0, Dest->getReg());
          continue;
        }

        if (Src0->isReg() && Src0->getReg() == Dest->getReg()) {
          if (Src1->isImm() && isKImmOperand(*Src1)) {
            unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ?
                               AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32;

            MI.setDesc(TII->get(Opc));
            MI.tieOperands(0, 1);
          }
        }
      }

      // Try to use s_cmpk_*
      if (MI.isCompare() && TII->isSOPC(MI)) {
        shrinkScalarCompare(MI);
        continue;
      }

      // Try to use S_MOVK_I32, which will save 4 bytes for small immediates.
      if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
        const MachineOperand &Dst = MI.getOperand(0);
        MachineOperand &Src = MI.getOperand(1);

        if (Src.isImm() && Dst.getReg().isPhysical()) {
          int32_t ReverseImm;
          if (isKImmOperand(Src))
            MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
          else if (isReverseInlineImm(Src, ReverseImm)) {
            MI.setDesc(TII->get(AMDGPU::S_BREV_B32));
            Src.setImm(ReverseImm);
          }
        }

        continue;
      }

      // Shrink scalar logic operations.
      if (MI.getOpcode() == AMDGPU::S_AND_B32 ||
          MI.getOpcode() == AMDGPU::S_OR_B32 ||
          MI.getOpcode() == AMDGPU::S_XOR_B32) {
        if (shrinkScalarLogicOp(MI))
          continue;
      }

      if (TII->isMIMG(MI.getOpcode()) &&
          ST->getGeneration() >= AMDGPUSubtarget::GFX10 &&
          MF.getProperties().hasProperty(
              MachineFunctionProperties::Property::NoVRegs)) {
        shrinkMIMG(MI);
        continue;
      }

      if (!TII->isVOP3(MI))
        continue;

      if (MI.getOpcode() == AMDGPU::V_MAD_F32_e64 ||
          MI.getOpcode() == AMDGPU::V_FMA_F32_e64 ||
          MI.getOpcode() == AMDGPU::V_MAD_F16_e64 ||
          MI.getOpcode() == AMDGPU::V_FMA_F16_e64 ||
          MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_e64) {
        shrinkMadFma(MI);
        continue;
      }

      if (!TII->hasVALU32BitEncoding(MI.getOpcode())) {
        // If there is no chance we will shrink it and use VCC as sdst to get
        // a 32-bit form, try to replace a dead sdst with NULL.
        tryReplaceDeadSDST(MI);
        continue;
      }

      if (!TII->canShrink(MI, *MRI)) {
        // Try commuting the instruction and see if that enables us to shrink
        // it.
        if (!MI.isCommutable() || !TII->commuteInstruction(MI) ||
            !TII->canShrink(MI, *MRI)) {
          tryReplaceDeadSDST(MI);
          continue;
        }
      }

      int Op32 = AMDGPU::getVOPe32(MI.getOpcode());

      if (TII->isVOPC(Op32)) {
        MachineOperand &Op0 = MI.getOperand(0);
        if (Op0.isReg()) {
          // Exclude VOPCX instructions as these don't explicitly write a
          // dst.
          Register DstReg = Op0.getReg();
          if (DstReg.isVirtual()) {
            // VOPC instructions can only write to the VCC register. We can't
            // force them to use VCC here, because this is only one register and
            // cannot deal with sequences which would require multiple copies of
            // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
            //
            // So, instead of forcing the instruction to write to VCC, we
            // provide a hint to the register allocator to use VCC and then we
            // will run this pass again after RA and shrink it if it outputs to
            // VCC.
            MRI->setRegAllocationHint(DstReg, 0, VCCReg);
            continue;
          }
          if (DstReg != VCCReg)
            continue;
        }
      }

      if (Op32 == AMDGPU::V_CNDMASK_B32_e32) {
        // We shrink V_CNDMASK_B32_e64 using regalloc hints like we do for VOPC
        // instructions.
        const MachineOperand *Src2 =
            TII->getNamedOperand(MI, AMDGPU::OpName::src2);
        if (!Src2->isReg())
          continue;
        Register SReg = Src2->getReg();
        if (SReg.isVirtual()) {
          MRI->setRegAllocationHint(SReg, 0, VCCReg);
          continue;
        }
        if (SReg != VCCReg)
          continue;
      }

      // Check for the bool flag output for instructions like V_ADD_I32_e64.
      const MachineOperand *SDst = TII->getNamedOperand(MI,
                                                        AMDGPU::OpName::sdst);

      if (SDst) {
        bool Next = false;

        if (SDst->getReg() != VCCReg) {
          if (SDst->getReg().isVirtual())
            MRI->setRegAllocationHint(SDst->getReg(), 0, VCCReg);
          Next = true;
        }

        // All of the instructions with carry outs also have an SGPR input in
        // src2.
        const MachineOperand *Src2 = TII->getNamedOperand(MI,
                                                          AMDGPU::OpName::src2);
        if (Src2 && Src2->getReg() != VCCReg) {
          if (Src2->getReg().isVirtual())
            MRI->setRegAllocationHint(Src2->getReg(), 0, VCCReg);
          Next = true;
        }

        if (Next)
          continue;
      }

      // Pre-GFX10, shrinking VOP3 instructions pre-RA gave us the chance to
      // fold an immediate into the shrunk instruction as a literal operand. In
      // GFX10 VOP3 instructions can take a literal operand anyway, so there is
      // no advantage to doing this.
      if (ST->hasVOP3Literal() &&
          !MF.getProperties().hasProperty(
              MachineFunctionProperties::Property::NoVRegs))
        continue;

      if (ST->hasTrue16BitInsts() && AMDGPU::isTrue16Inst(MI.getOpcode()) &&
          !shouldShrinkTrue16(MI))
        continue;

      // We can shrink this instruction
      LLVM_DEBUG(dbgs() << "Shrinking " << MI);

      MachineInstr *Inst32 = TII->buildShrunkInst(MI, Op32);
      ++NumInstructionsShrunk;

      // Copy extra operands not present in the instruction definition.
      copyExtraImplicitOps(*Inst32, MI);

      // Copy deadness from the old explicit vcc def to the new implicit def.
      if (SDst && SDst->isDead())
        Inst32->findRegisterDefOperand(VCCReg)->setIsDead();

      MI.eraseFromParent();
      foldImmediates(*Inst32);

      LLVM_DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n');
    }
  }
  return false;
}