//===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
/// The pass tries to use the 32-bit encoding for instructions when possible.
//===----------------------------------------------------------------------===//
//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"

#define DEBUG_TYPE "si-shrink-instructions"

STATISTIC(NumInstructionsShrunk,
          "Number of 64-bit instructions reduced to 32-bit.");
STATISTIC(NumLiteralConstantsFolded,
          "Number of literal constants folded into 32-bit instructions.");

using namespace llvm;

namespace {

class SIShrinkInstructions : public MachineFunctionPass {
  MachineFunction *MF;
  MachineRegisterInfo *MRI;
  const GCNSubtarget *ST;
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;

public:
  static char ID;

  SIShrinkInstructions() : MachineFunctionPass(ID) {}

  bool foldImmediates(MachineInstr &MI, bool TryToCommute = true) const;
  bool shouldShrinkTrue16(MachineInstr &MI) const;
  bool isKImmOperand(const MachineOperand &Src) const;
  bool isKUImmOperand(const MachineOperand &Src) const;
  bool isKImmOrKUImmOperand(const MachineOperand &Src, bool &IsUnsigned) const;
  void copyExtraImplicitOps(MachineInstr &NewMI, MachineInstr &MI) const;
  void shrinkScalarCompare(MachineInstr &MI) const;
  void shrinkMIMG(MachineInstr &MI) const;
  void shrinkMadFma(MachineInstr &MI) const;
  bool shrinkScalarLogicOp(MachineInstr &MI) const;
  bool tryReplaceDeadSDST(MachineInstr &MI) const;
  bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R,
                     Register Reg, unsigned SubReg) const;
  bool instReadsReg(const MachineInstr *MI, unsigned Reg,
                    unsigned SubReg) const;
  bool instModifiesReg(const MachineInstr *MI, unsigned Reg,
                       unsigned SubReg) const;
  TargetInstrInfo::RegSubRegPair getSubRegForIndex(Register Reg, unsigned Sub,
                                                   unsigned I) const;
  void dropInstructionKeepingImpDefs(MachineInstr &MI) const;
  MachineInstr *matchSwap(MachineInstr &MovT) const;

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Shrink Instructions"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS(SIShrinkInstructions, DEBUG_TYPE,
                "SI Shrink Instructions", false, false)

char SIShrinkInstructions::ID = 0;

FunctionPass *llvm::createSIShrinkInstructionsPass() {
  return new SIShrinkInstructions();
}

/// This function checks \p MI for operands defined by a move immediate
/// instruction and then folds the literal constant into the instruction if it
/// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instruction.
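/// For example (illustrative, not taken from a specific test), given
///   %1 = V_MOV_B32_e32 0x41200000
///   %2 = V_ADD_F32_e32 %1, %0
/// the literal is folded to give "%2 = V_ADD_F32_e32 0x41200000, %0", and the
/// V_MOV_B32 is erased once it has no remaining users.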
bool SIShrinkInstructions::foldImmediates(MachineInstr &MI,
                                          bool TryToCommute) const {
  assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI));

  int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);

  // Try to fold Src0
  MachineOperand &Src0 = MI.getOperand(Src0Idx);
  if (Src0.isReg()) {
    Register Reg = Src0.getReg();
    if (Reg.isVirtual()) {
      MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
      if (Def && Def->isMoveImmediate()) {
        MachineOperand &MovSrc = Def->getOperand(1);
        bool ConstantFolded = false;

        if (TII->isOperandLegal(MI, Src0Idx, &MovSrc)) {
          if (MovSrc.isImm()) {
            Src0.ChangeToImmediate(MovSrc.getImm());
            ConstantFolded = true;
          } else if (MovSrc.isFI()) {
            Src0.ChangeToFrameIndex(MovSrc.getIndex());
            ConstantFolded = true;
          } else if (MovSrc.isGlobal()) {
            Src0.ChangeToGA(MovSrc.getGlobal(), MovSrc.getOffset(),
                            MovSrc.getTargetFlags());
            ConstantFolded = true;
          }
        }

        if (ConstantFolded) {
          if (MRI->use_nodbg_empty(Reg))
            Def->eraseFromParent();
          ++NumLiteralConstantsFolded;
          return true;
        }
      }
    }
  }

  // We have failed to fold src0, so commute the instruction and try again.
  if (TryToCommute && MI.isCommutable()) {
    if (TII->commuteInstruction(MI)) {
      if (foldImmediates(MI, false))
        return true;

      // Commute back.
      TII->commuteInstruction(MI);
    }
  }

  return false;
}

/// Do not shrink the instruction if its registers are not expressible in the
/// shrunk encoding.
bool SIShrinkInstructions::shouldShrinkTrue16(MachineInstr &MI) const {
  for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
    const MachineOperand &MO = MI.getOperand(I);
    if (MO.isReg()) {
      Register Reg = MO.getReg();
      assert(!Reg.isVirtual() && "Prior checks should ensure we only shrink "
                                 "True16 Instructions post-RA");
      if (AMDGPU::VGPR_32RegClass.contains(Reg) &&
          !AMDGPU::VGPR_32_Lo128RegClass.contains(Reg))
        return false;
    }
  }
  return true;
}

bool SIShrinkInstructions::isKImmOperand(const MachineOperand &Src) const {
  return isInt<16>(SignExtend64(Src.getImm(), 32)) &&
         !TII->isInlineConstant(*Src.getParent(), Src.getOperandNo());
}

bool SIShrinkInstructions::isKUImmOperand(const MachineOperand &Src) const {
  return isUInt<16>(Src.getImm()) &&
         !TII->isInlineConstant(*Src.getParent(), Src.getOperandNo());
}

bool SIShrinkInstructions::isKImmOrKUImmOperand(const MachineOperand &Src,
                                                bool &IsUnsigned) const {
  if (isInt<16>(SignExtend64(Src.getImm(), 32))) {
    IsUnsigned = false;
    return !TII->isInlineConstant(Src);
  }

  if (isUInt<16>(Src.getImm())) {
    IsUnsigned = true;
    return !TII->isInlineConstant(Src);
  }

  return false;
}

/// \returns the opcode of an instruction that a move immediate of the constant
/// \p Src can be replaced with, if the constant is replaced with
/// \p ModifiedImm, i.e.:
///
/// If the bitreverse of the constant is an inline immediate, reverse the
/// immediate and return the bitreverse opcode.
///
/// If the bitwise negation of the constant is an inline immediate, invert the
/// immediate and return the bitwise not opcode.
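/// For example (illustrative): 0x80000000 is not an inline constant, but its
/// bitreverse is 1, so "v_mov_b32 v0, 0x80000000" can become
/// "v_bfrev_b32 v0, 1"; similarly ~0xFFFFFFC6 == 57, so a vector move of
/// 0xFFFFFFC6 can become "v_not_b32 v0, 57".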
static unsigned canModifyToInlineImmOp32(const SIInstrInfo *TII,
                                         const MachineOperand &Src,
                                         int32_t &ModifiedImm, bool Scalar) {
  if (TII->isInlineConstant(Src))
    return 0;
  int32_t SrcImm = static_cast<int32_t>(Src.getImm());

  if (!Scalar) {
    // We could also handle the scalar case here, but we would need to check
    // that SCC is not live as S_NOT_B32 clobbers it. It's probably not worth
    // it, as the reasonable values are already covered by s_movk_i32.
    ModifiedImm = ~SrcImm;
    if (TII->isInlineConstant(APInt(32, ModifiedImm)))
      return AMDGPU::V_NOT_B32_e32;
  }

  ModifiedImm = reverseBits<int32_t>(SrcImm);
  if (TII->isInlineConstant(APInt(32, ModifiedImm)))
    return Scalar ? AMDGPU::S_BREV_B32 : AMDGPU::V_BFREV_B32_e32;

  return 0;
}

/// Copy implicit register operands from specified instruction to this
/// instruction that are not part of the instruction definition.
void SIShrinkInstructions::copyExtraImplicitOps(MachineInstr &NewMI,
                                                MachineInstr &MI) const {
  MachineFunction &MF = *MI.getMF();
  for (unsigned i = MI.getDesc().getNumOperands() +
                    MI.getDesc().implicit_uses().size() +
                    MI.getDesc().implicit_defs().size(),
                e = MI.getNumOperands();
       i != e; ++i) {
    const MachineOperand &MO = MI.getOperand(i);
    if ((MO.isReg() && MO.isImplicit()) || MO.isRegMask())
      NewMI.addOperand(MF, MO);
  }
}

void SIShrinkInstructions::shrinkScalarCompare(MachineInstr &MI) const {
  if (!ST->hasSCmpK())
    return;

  // cmpk instructions do scc = dst <cc op> imm16, so commute the instruction
  // to get constants on the RHS.
  if (!MI.getOperand(0).isReg())
    TII->commuteInstruction(MI, false, 0, 1);

  // cmpk requires src0 to be a register
  const MachineOperand &Src0 = MI.getOperand(0);
  if (!Src0.isReg())
    return;

  MachineOperand &Src1 = MI.getOperand(1);
  if (!Src1.isImm())
    return;

  int SOPKOpc = AMDGPU::getSOPKOp(MI.getOpcode());
  if (SOPKOpc == -1)
    return;

  // eq/ne is special because the imm16 can be treated as signed or unsigned,
  // and initially selected to the unsigned versions.
  if (SOPKOpc == AMDGPU::S_CMPK_EQ_U32 || SOPKOpc == AMDGPU::S_CMPK_LG_U32) {
    bool HasUImm;
    if (isKImmOrKUImmOperand(Src1, HasUImm)) {
      if (!HasUImm) {
        SOPKOpc = (SOPKOpc == AMDGPU::S_CMPK_EQ_U32) ?
          AMDGPU::S_CMPK_EQ_I32 : AMDGPU::S_CMPK_LG_I32;
        Src1.setImm(SignExtend32(Src1.getImm(), 32));
      }

      MI.setDesc(TII->get(SOPKOpc));
    }

    return;
  }

  const MCInstrDesc &NewDesc = TII->get(SOPKOpc);

  if ((SIInstrInfo::sopkIsZext(SOPKOpc) && isKUImmOperand(Src1)) ||
      (!SIInstrInfo::sopkIsZext(SOPKOpc) && isKImmOperand(Src1))) {
    if (!SIInstrInfo::sopkIsZext(SOPKOpc))
      Src1.setImm(SignExtend64(Src1.getImm(), 32));
    MI.setDesc(NewDesc);
  }
}

// Shrink NSA encoded instructions with contiguous VGPRs to non-NSA encoding.
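// For example (illustrative), an NSA form such as
//   image_sample v[0:3], [v4, v5, v6], s[0:7], s[8:11]
// can be re-encoded as
//   image_sample v[0:3], v[4:6], s[0:7], s[8:11]
// when the address registers happen to be allocated contiguously, saving the
// extra NSA address dwords in the encoding.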
void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) const {
  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
  if (!Info)
    return;

  uint8_t NewEncoding;
  switch (Info->MIMGEncoding) {
  case AMDGPU::MIMGEncGfx10NSA:
    NewEncoding = AMDGPU::MIMGEncGfx10Default;
    break;
  case AMDGPU::MIMGEncGfx11NSA:
    NewEncoding = AMDGPU::MIMGEncGfx11Default;
    break;
  default:
    return;
  }

  int VAddr0Idx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
  unsigned NewAddrDwords = Info->VAddrDwords;
  const TargetRegisterClass *RC;

  if (Info->VAddrDwords == 2) {
    RC = &AMDGPU::VReg_64RegClass;
  } else if (Info->VAddrDwords == 3) {
    RC = &AMDGPU::VReg_96RegClass;
  } else if (Info->VAddrDwords == 4) {
    RC = &AMDGPU::VReg_128RegClass;
  } else if (Info->VAddrDwords == 5) {
    RC = &AMDGPU::VReg_160RegClass;
  } else if (Info->VAddrDwords == 6) {
    RC = &AMDGPU::VReg_192RegClass;
  } else if (Info->VAddrDwords == 7) {
    RC = &AMDGPU::VReg_224RegClass;
  } else if (Info->VAddrDwords == 8) {
    RC = &AMDGPU::VReg_256RegClass;
  } else if (Info->VAddrDwords == 9) {
    RC = &AMDGPU::VReg_288RegClass;
  } else if (Info->VAddrDwords == 10) {
    RC = &AMDGPU::VReg_320RegClass;
  } else if (Info->VAddrDwords == 11) {
    RC = &AMDGPU::VReg_352RegClass;
  } else if (Info->VAddrDwords == 12) {
    RC = &AMDGPU::VReg_384RegClass;
  } else {
    RC = &AMDGPU::VReg_512RegClass;
    NewAddrDwords = 16;
  }

  unsigned VgprBase = 0;
  unsigned NextVgpr = 0;
  bool IsUndef = true;
  bool IsKill = NewAddrDwords == Info->VAddrDwords;
  const unsigned NSAMaxSize = ST->getNSAMaxSize();
  const bool IsPartialNSA = NewAddrDwords > NSAMaxSize;
  const unsigned EndVAddr = IsPartialNSA ? NSAMaxSize : Info->VAddrOperands;
  for (unsigned Idx = 0; Idx < EndVAddr; ++Idx) {
    const MachineOperand &Op = MI.getOperand(VAddr0Idx + Idx);
    unsigned Vgpr = TRI->getHWRegIndex(Op.getReg());
    unsigned Dwords = TRI->getRegSizeInBits(Op.getReg(), *MRI) / 32;
    assert(Dwords > 0 && "Un-implemented for less than 32 bit regs");

    if (Idx == 0) {
      VgprBase = Vgpr;
      NextVgpr = Vgpr + Dwords;
    } else if (Vgpr == NextVgpr) {
      NextVgpr = Vgpr + Dwords;
    } else {
      return;
    }

    if (!Op.isUndef())
      IsUndef = false;
    if (!Op.isKill())
      IsKill = false;
  }

  if (VgprBase + NewAddrDwords > 256)
    return;

  // Further check for implicit tied operands - this may be present if TFE is
  // enabled
  int TFEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::tfe);
  int LWEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::lwe);
  unsigned TFEVal = (TFEIdx == -1) ? 0 : MI.getOperand(TFEIdx).getImm();
  unsigned LWEVal = (LWEIdx == -1) ? 0 : MI.getOperand(LWEIdx).getImm();
  int ToUntie = -1;
  if (TFEVal || LWEVal) {
    // TFE/LWE is enabled so we need to deal with an implicit tied operand
    for (unsigned i = LWEIdx + 1, e = MI.getNumOperands(); i != e; ++i) {
      if (MI.getOperand(i).isReg() && MI.getOperand(i).isTied() &&
          MI.getOperand(i).isImplicit()) {
        // This is the tied operand
        assert(
            ToUntie == -1 &&
            "found more than one tied implicit operand when expecting only 1");
        ToUntie = i;
        MI.untieRegOperand(ToUntie);
      }
    }
  }

  unsigned NewOpcode = AMDGPU::getMIMGOpcode(Info->BaseOpcode, NewEncoding,
                                             Info->VDataDwords, NewAddrDwords);
  MI.setDesc(TII->get(NewOpcode));
  MI.getOperand(VAddr0Idx).setReg(RC->getRegister(VgprBase));
  MI.getOperand(VAddr0Idx).setIsUndef(IsUndef);
  MI.getOperand(VAddr0Idx).setIsKill(IsKill);

  for (unsigned i = 1; i < EndVAddr; ++i)
    MI.removeOperand(VAddr0Idx + 1);

  if (ToUntie >= 0) {
    MI.tieOperands(
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata),
        ToUntie - (EndVAddr - 1));
  }
}

// Shrink MAD to MADAK/MADMK and FMA to FMAAK/FMAMK.
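// For example (illustrative), with a non-inlinable constant K:
//   v_mad_f32 v0, v1, v2, K  -->  v_madak_f32 v0, v1, v2, K   (d = a*b + K)
//   v_mad_f32 v0, v1, K, v2  -->  v_madmk_f32 v0, v1, K, v2   (d = a*K + b)
// which lets the literal be encoded in the smaller VOP2-with-literal form.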
void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
  // Pre-GFX10 VOP3 instructions like MAD/FMA cannot take a literal operand so
  // there is no reason to try to shrink them.
  if (!ST->hasVOP3Literal())
    return;

  // There is no advantage to doing this pre-RA.
  if (!MF->getProperties().hasProperty(
          MachineFunctionProperties::Property::NoVRegs))
    return;

  if (TII->hasAnyModifiersSet(MI))
    return;

  const unsigned Opcode = MI.getOpcode();
  MachineOperand &Src0 = *TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand &Src1 = *TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  MachineOperand &Src2 = *TII->getNamedOperand(MI, AMDGPU::OpName::src2);
  unsigned NewOpcode = AMDGPU::INSTRUCTION_LIST_END;

  bool Swap;

  // Detect "Dst = VSrc * VGPR + Imm" and convert to AK form.
  if (Src2.isImm() && !TII->isInlineConstant(Src2)) {
    if (Src1.isReg() && TRI->isVGPR(*MRI, Src1.getReg()))
      Swap = false;
    else if (Src0.isReg() && TRI->isVGPR(*MRI, Src0.getReg()))
      Swap = true;
    else
      return;

    switch (Opcode) {
    default:
      llvm_unreachable("Unexpected mad/fma opcode!");
    case AMDGPU::V_MAD_F32_e64:
      NewOpcode = AMDGPU::V_MADAK_F32;
      break;
    case AMDGPU::V_FMA_F32_e64:
      NewOpcode = AMDGPU::V_FMAAK_F32;
      break;
    case AMDGPU::V_MAD_F16_e64:
      NewOpcode = AMDGPU::V_MADAK_F16;
      break;
    case AMDGPU::V_FMA_F16_e64:
    case AMDGPU::V_FMA_F16_gfx9_e64:
      NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_t16
                                          : AMDGPU::V_FMAAK_F16;
      break;
    }
  }

  // Detect "Dst = VSrc * Imm + VGPR" and convert to MK form.
  if (Src2.isReg() && TRI->isVGPR(*MRI, Src2.getReg())) {
    if (Src1.isImm() && !TII->isInlineConstant(Src1))
      Swap = false;
    else if (Src0.isImm() && !TII->isInlineConstant(Src0))
      Swap = true;
    else
      return;

    switch (Opcode) {
    default:
      llvm_unreachable("Unexpected mad/fma opcode!");
    case AMDGPU::V_MAD_F32_e64:
      NewOpcode = AMDGPU::V_MADMK_F32;
      break;
    case AMDGPU::V_FMA_F32_e64:
      NewOpcode = AMDGPU::V_FMAMK_F32;
      break;
    case AMDGPU::V_MAD_F16_e64:
      NewOpcode = AMDGPU::V_MADMK_F16;
      break;
    case AMDGPU::V_FMA_F16_e64:
    case AMDGPU::V_FMA_F16_gfx9_e64:
      NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_t16
                                          : AMDGPU::V_FMAMK_F16;
      break;
    }
  }

  if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END)
    return;

  if (AMDGPU::isTrue16Inst(NewOpcode) && !shouldShrinkTrue16(MI))
    return;

  if (Swap) {
    // Swap Src0 and Src1 by building a new instruction.
    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(NewOpcode),
            MI.getOperand(0).getReg())
        .add(Src1)
        .add(Src0)
        .add(Src2)
        .setMIFlags(MI.getFlags());
    MI.eraseFromParent();
  } else {
    TII->removeModOperands(MI);
    MI.setDesc(TII->get(NewOpcode));
  }
}

/// Attempt to shrink AND/OR/XOR operations requiring non-inlineable literals.
/// For AND or OR, try using S_BITSET{0,1} to clear or set bits.
/// If the inverse of the immediate is legal, use ANDN2, ORN2 or
/// XNOR (as a ^ b == ~(a ^ ~b)).
/// \returns true if the caller should continue the machine function iterator
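/// For example (illustrative): "s_and_b32 s0, s0, 0xffffdfff" clears a single
/// bit and can become "s_bitset0_b32 s0, 13", while "s_or_b32 s0, s0,
/// 0xffffffc0" has an inlinable inverse (63) and can become
/// "s_orn2_b32 s0, s0, 63".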
bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();
  const MachineOperand *Dest = &MI.getOperand(0);
  MachineOperand *Src0 = &MI.getOperand(1);
  MachineOperand *Src1 = &MI.getOperand(2);
  MachineOperand *SrcReg = Src0;
  MachineOperand *SrcImm = Src1;

  if (!SrcImm->isImm() ||
      AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST->hasInv2PiInlineImm()))
    return false;

  uint32_t Imm = static_cast<uint32_t>(SrcImm->getImm());
  uint32_t NewImm = 0;

  if (Opc == AMDGPU::S_AND_B32) {
    if (isPowerOf2_32(~Imm)) {
      NewImm = llvm::countr_one(Imm);
      Opc = AMDGPU::S_BITSET0_B32;
    } else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
      NewImm = ~Imm;
      Opc = AMDGPU::S_ANDN2_B32;
    }
  } else if (Opc == AMDGPU::S_OR_B32) {
    if (isPowerOf2_32(Imm)) {
      NewImm = llvm::countr_zero(Imm);
      Opc = AMDGPU::S_BITSET1_B32;
    } else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
      NewImm = ~Imm;
      Opc = AMDGPU::S_ORN2_B32;
    }
  } else if (Opc == AMDGPU::S_XOR_B32) {
    if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
      NewImm = ~Imm;
      Opc = AMDGPU::S_XNOR_B32;
    }
  } else {
    llvm_unreachable("unexpected opcode");
  }

  if (NewImm != 0) {
    if (Dest->getReg().isVirtual() && SrcReg->isReg()) {
      MRI->setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg());
      MRI->setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg());
      return true;
    }

    if (SrcReg->isReg() && SrcReg->getReg() == Dest->getReg()) {
      const bool IsUndef = SrcReg->isUndef();
      const bool IsKill = SrcReg->isKill();
      MI.setDesc(TII->get(Opc));
      if (Opc == AMDGPU::S_BITSET0_B32 ||
          Opc == AMDGPU::S_BITSET1_B32) {
        Src0->ChangeToImmediate(NewImm);
        // Remove the immediate and add the tied input.
        MI.getOperand(2).ChangeToRegister(Dest->getReg(), /*IsDef*/ false,
                                          /*isImp*/ false, IsKill,
                                          /*isDead*/ false, IsUndef);
        MI.tieOperands(0, 2);
      } else {
        SrcImm->setImm(NewImm);
      }
    }
  }

  return false;
}

// This is the same as MachineInstr::readsRegister/modifiesRegister except
// it takes subregs into account.
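// For example (illustrative), a use of %0.sub1 does not access %0.sub0, but
// both of them overlap an access of %0 with no subregister index.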
bool SIShrinkInstructions::instAccessReg(
    iterator_range<MachineInstr::const_mop_iterator> &&R, Register Reg,
    unsigned SubReg) const {
  for (const MachineOperand &MO : R) {
    if (!MO.isReg())
      continue;

    if (Reg.isPhysical() && MO.getReg().isPhysical()) {
      if (TRI->regsOverlap(Reg, MO.getReg()))
        return true;
    } else if (MO.getReg() == Reg && Reg.isVirtual()) {
      LaneBitmask Overlap = TRI->getSubRegIndexLaneMask(SubReg) &
                            TRI->getSubRegIndexLaneMask(MO.getSubReg());
      if (Overlap.any())
        return true;
    }
  }
  return false;
}

bool SIShrinkInstructions::instReadsReg(const MachineInstr *MI, unsigned Reg,
                                        unsigned SubReg) const {
  return instAccessReg(MI->uses(), Reg, SubReg);
}

bool SIShrinkInstructions::instModifiesReg(const MachineInstr *MI, unsigned Reg,
                                           unsigned SubReg) const {
  return instAccessReg(MI->defs(), Reg, SubReg);
}

TargetInstrInfo::RegSubRegPair
SIShrinkInstructions::getSubRegForIndex(Register Reg, unsigned Sub,
                                        unsigned I) const {
  if (TRI->getRegSizeInBits(Reg, *MRI) != 32) {
    if (Reg.isPhysical()) {
      Reg = TRI->getSubReg(Reg, TRI->getSubRegFromChannel(I));
    } else {
      Sub = TRI->getSubRegFromChannel(I + TRI->getChannelFromSubReg(Sub));
    }
  }
  return TargetInstrInfo::RegSubRegPair(Reg, Sub);
}

void SIShrinkInstructions::dropInstructionKeepingImpDefs(
    MachineInstr &MI) const {
  for (unsigned i = MI.getDesc().getNumOperands() +
                    MI.getDesc().implicit_uses().size() +
                    MI.getDesc().implicit_defs().size(),
                e = MI.getNumOperands();
       i != e; ++i) {
    const MachineOperand &Op = MI.getOperand(i);
    if (!Op.isDef())
      continue;
    BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
            TII->get(AMDGPU::IMPLICIT_DEF), Op.getReg());
  }

  MI.eraseFromParent();
}

// Match:
// mov t, x
// mov x, y
// mov y, t
//
// =>
//
// mov t, x (t is potentially dead and move eliminated)
// v_swap_b32 x, y
//
// Returns the next valid instruction pointer if it was able to create a
// v_swap_b32.
//
// This should not be done too early, so as not to prevent possible folding
// which may remove the matched moves. It is preferably done before RA to
// release the saved registers, and possibly also after RA, which can insert
// copies too.
//
// This is really just a generic peephole rather than a canonical shrinking,
// although its requirements match the pass placement and it reduces code size
// too.
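// A concrete instance (illustrative):
//   v_mov_b32 v3, v0
//   v_mov_b32 v0, v1
//   v_mov_b32 v1, v3
// becomes a single "v_swap_b32 v0, v1" once v3 turns out to be dead.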
MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const {
  assert(MovT.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
         MovT.getOpcode() == AMDGPU::COPY);

  Register T = MovT.getOperand(0).getReg();
  unsigned Tsub = MovT.getOperand(0).getSubReg();
  MachineOperand &Xop = MovT.getOperand(1);

  if (!Xop.isReg())
    return nullptr;
  Register X = Xop.getReg();
  unsigned Xsub = Xop.getSubReg();

  unsigned Size = TII->getOpSize(MovT, 0) / 4;

  if (!TRI->isVGPR(*MRI, X))
    return nullptr;

  const unsigned SearchLimit = 16;
  unsigned Count = 0;
  bool KilledT = false;
  for (auto Iter = std::next(MovT.getIterator()),
            E = MovT.getParent()->instr_end();
       Iter != E && Count < SearchLimit && !KilledT; ++Iter, ++Count) {

    MachineInstr *MovY = &*Iter;
    KilledT = MovY->killsRegister(T, TRI);

    if ((MovY->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
         MovY->getOpcode() != AMDGPU::COPY) ||
        !MovY->getOperand(1).isReg() ||
        MovY->getOperand(1).getReg() != T ||
        MovY->getOperand(1).getSubReg() != Tsub)
      continue;

    Register Y = MovY->getOperand(0).getReg();
    unsigned Ysub = MovY->getOperand(0).getSubReg();

    if (!TRI->isVGPR(*MRI, Y))
      continue;

    MachineInstr *MovX = nullptr;
    for (auto IY = MovY->getIterator(), I = std::next(MovT.getIterator());
         I != IY; ++I) {
      if (instReadsReg(&*I, X, Xsub) || instModifiesReg(&*I, Y, Ysub) ||
          instModifiesReg(&*I, T, Tsub) ||
          (MovX && instModifiesReg(&*I, X, Xsub))) {
        MovX = nullptr;
        break;
      }
      if (!instReadsReg(&*I, Y, Ysub)) {
        if (!MovX && instModifiesReg(&*I, X, Xsub)) {
          MovX = nullptr;
          break;
        }
        continue;
      }
      if (MovX ||
          (I->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
           I->getOpcode() != AMDGPU::COPY) ||
          I->getOperand(0).getReg() != X ||
          I->getOperand(0).getSubReg() != Xsub) {
        MovX = nullptr;
        break;
      }

      if (Size > 1 && (I->getNumImplicitOperands() > (I->isCopy() ? 0U : 1U)))
        continue;

      MovX = &*I;
    }

    if (!MovX)
      continue;

    LLVM_DEBUG(dbgs() << "Matched v_swap_b32:\n" << MovT << *MovX << *MovY);

    for (unsigned I = 0; I < Size; ++I) {
      TargetInstrInfo::RegSubRegPair X1, Y1;
      X1 = getSubRegForIndex(X, Xsub, I);
      Y1 = getSubRegForIndex(Y, Ysub, I);
      MachineBasicBlock &MBB = *MovT.getParent();
      auto MIB = BuildMI(MBB, MovX->getIterator(), MovT.getDebugLoc(),
                         TII->get(AMDGPU::V_SWAP_B32))
                     .addDef(X1.Reg, 0, X1.SubReg)
                     .addDef(Y1.Reg, 0, Y1.SubReg)
                     .addReg(Y1.Reg, 0, Y1.SubReg)
                     .addReg(X1.Reg, 0, X1.SubReg)
                     .getInstr();
      if (MovX->hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
        // Drop implicit EXEC.
        MIB->removeOperand(MIB->getNumExplicitOperands());
        MIB->copyImplicitOps(*MBB.getParent(), *MovX);
      }
    }
    MovX->eraseFromParent();
    dropInstructionKeepingImpDefs(*MovY);
    MachineInstr *Next = &*std::next(MovT.getIterator());

    if (T.isVirtual() && MRI->use_nodbg_empty(T)) {
      dropInstructionKeepingImpDefs(MovT);
    } else {
      Xop.setIsKill(false);
      for (int I = MovT.getNumImplicitOperands() - 1; I >= 0; --I) {
        unsigned OpNo = MovT.getNumExplicitOperands() + I;
        const MachineOperand &Op = MovT.getOperand(OpNo);
        if (Op.isKill() && TRI->regsOverlap(X, Op.getReg()))
          MovT.removeOperand(OpNo);
      }
    }

    return Next;
  }

  return nullptr;
}

// If an instruction has a dead sdst, replace it with the NULL register on
// gfx1030+.
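// For example (illustrative), "v_add_co_u32 v0, s[0:1], v1, v2" whose carry
// out in s[0:1] is never read can write the carry to the null register
// instead, freeing the SGPR pair.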
bool SIShrinkInstructions::tryReplaceDeadSDST(MachineInstr &MI) const {
  if (!ST->hasGFX10_3Insts())
    return false;

  MachineOperand *Op = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
  if (!Op)
    return false;
  Register SDstReg = Op->getReg();
  if (SDstReg.isPhysical() || !MRI->use_nodbg_empty(SDstReg))
    return false;

  Op->setReg(ST->isWave32() ? AMDGPU::SGPR_NULL : AMDGPU::SGPR_NULL64);
  return true;
}

bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  this->MF = &MF;
  MRI = &MF.getRegInfo();
  ST = &MF.getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  unsigned VCCReg = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;

  std::vector<unsigned> I1Defs;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      if (MI.getOpcode() == AMDGPU::V_MOV_B32_e32) {
        // If this has a literal constant source that is the same as the
        // reversed bits of an inline immediate, replace with a bitreverse of
        // that constant. This saves 4 bytes in the common case of
        // materializing sign bits.

        // Test if we are after regalloc. We only want to do this after any
        // optimizations happen because this will confuse them.
        // XXX - not exactly a check for post-regalloc run.
        MachineOperand &Src = MI.getOperand(1);
        if (Src.isImm() && MI.getOperand(0).getReg().isPhysical()) {
          int32_t ModImm;
          unsigned ModOpcode =
              canModifyToInlineImmOp32(TII, Src, ModImm, /*Scalar=*/false);
          if (ModOpcode != 0) {
            MI.setDesc(TII->get(ModOpcode));
            Src.setImm(static_cast<int64_t>(ModImm));
            continue;
          }
        }
      }

      if (ST->hasSwap() && (MI.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
                            MI.getOpcode() == AMDGPU::COPY)) {
        if (auto *NextMI = matchSwap(MI)) {
          Next = NextMI->getIterator();
          continue;
        }
      }

      // Try to use S_ADDK_I32 and S_MULK_I32.
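      // For example (illustrative), "s_add_i32 s0, s0, 0x1234" can become
      // "s_addk_i32 s0, 0x1234" when the destination is tied to a source.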
      if (MI.getOpcode() == AMDGPU::S_ADD_I32 ||
          MI.getOpcode() == AMDGPU::S_MUL_I32) {
        const MachineOperand *Dest = &MI.getOperand(0);
        MachineOperand *Src0 = &MI.getOperand(1);
        MachineOperand *Src1 = &MI.getOperand(2);

        if (!Src0->isReg() && Src1->isReg()) {
          if (TII->commuteInstruction(MI, false, 1, 2))
            std::swap(Src0, Src1);
        }

        // FIXME: This could work better if hints worked with subregisters. If
        // we have a vector add of a constant, we usually don't get the correct
        // allocation due to the subregister usage.
        if (Dest->getReg().isVirtual() && Src0->isReg()) {
          MRI->setRegAllocationHint(Dest->getReg(), 0, Src0->getReg());
          MRI->setRegAllocationHint(Src0->getReg(), 0, Dest->getReg());
          continue;
        }

        if (Src0->isReg() && Src0->getReg() == Dest->getReg()) {
          if (Src1->isImm() && isKImmOperand(*Src1)) {
            unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ?
              AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32;

            Src1->setImm(SignExtend64(Src1->getImm(), 32));
            MI.setDesc(TII->get(Opc));
            MI.tieOperands(0, 1);
          }
        }
      }

      // Try to use s_cmpk_*
      if (MI.isCompare() && TII->isSOPC(MI)) {
        shrinkScalarCompare(MI);
        continue;
      }

      // Try to use S_MOVK_I32, which will save 4 bytes for small immediates.
      if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
        const MachineOperand &Dst = MI.getOperand(0);
        MachineOperand &Src = MI.getOperand(1);

        if (Src.isImm() && Dst.getReg().isPhysical()) {
          unsigned ModOpc;
          int32_t ModImm;
          if (isKImmOperand(Src)) {
            MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
            Src.setImm(SignExtend64(Src.getImm(), 32));
          } else if ((ModOpc = canModifyToInlineImmOp32(TII, Src, ModImm,
                                                        /*Scalar=*/true))) {
            MI.setDesc(TII->get(ModOpc));
            Src.setImm(static_cast<int64_t>(ModImm));
          }
        }

        continue;
      }

      // Shrink scalar logic operations.
      if (MI.getOpcode() == AMDGPU::S_AND_B32 ||
          MI.getOpcode() == AMDGPU::S_OR_B32 ||
          MI.getOpcode() == AMDGPU::S_XOR_B32) {
        if (shrinkScalarLogicOp(MI))
          continue;
      }

      if (TII->isMIMG(MI.getOpcode()) &&
          ST->getGeneration() >= AMDGPUSubtarget::GFX10 &&
          MF.getProperties().hasProperty(
              MachineFunctionProperties::Property::NoVRegs)) {
        shrinkMIMG(MI);
        continue;
      }

      if (!TII->isVOP3(MI))
        continue;

      if (MI.getOpcode() == AMDGPU::V_MAD_F32_e64 ||
          MI.getOpcode() == AMDGPU::V_FMA_F32_e64 ||
          MI.getOpcode() == AMDGPU::V_MAD_F16_e64 ||
          MI.getOpcode() == AMDGPU::V_FMA_F16_e64 ||
          MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_e64) {
        shrinkMadFma(MI);
        continue;
      }

      if (!TII->hasVALU32BitEncoding(MI.getOpcode())) {
        // If there is no chance we will shrink it and use VCC as sdst to get
        // a 32-bit form, try to replace the dead sdst with NULL.
        tryReplaceDeadSDST(MI);
        continue;
      }

      if (!TII->canShrink(MI, *MRI)) {
        // Try commuting the instruction and see if that enables us to shrink
        // it.
        if (!MI.isCommutable() || !TII->commuteInstruction(MI) ||
            !TII->canShrink(MI, *MRI)) {
          tryReplaceDeadSDST(MI);
          continue;
        }
      }

      int Op32 = AMDGPU::getVOPe32(MI.getOpcode());

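      // From here on the instruction has a 32-bit (e32) counterpart. The VOPC
      // and carry-in/carry-out cases below can only use the e32 form when they
      // read or write VCC, so before RA we just record allocation hints and
      // shrink on the post-RA run if VCC was chosen.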
      if (TII->isVOPC(Op32)) {
        MachineOperand &Op0 = MI.getOperand(0);
        if (Op0.isReg()) {
          // Exclude VOPCX instructions as these don't explicitly write a
          // dst.
          Register DstReg = Op0.getReg();
          if (DstReg.isVirtual()) {
            // VOPC instructions can only write to the VCC register. We can't
            // force them to use VCC here, because this is only one register
            // and we cannot deal with sequences which would require multiple
            // copies of VCC, e.g.
            // S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
            //
            // So, instead of forcing the instruction to write to VCC, we
            // provide a hint to the register allocator to use VCC and then we
            // will run this pass again after RA and shrink it if it outputs to
            // VCC.
            MRI->setRegAllocationHint(DstReg, 0, VCCReg);
            continue;
          }
          if (DstReg != VCCReg)
            continue;
        }
      }

      if (Op32 == AMDGPU::V_CNDMASK_B32_e32) {
        // We shrink V_CNDMASK_B32_e64 using regalloc hints like we do for VOPC
        // instructions.
        const MachineOperand *Src2 =
            TII->getNamedOperand(MI, AMDGPU::OpName::src2);
        if (!Src2->isReg())
          continue;
        Register SReg = Src2->getReg();
        if (SReg.isVirtual()) {
          MRI->setRegAllocationHint(SReg, 0, VCCReg);
          continue;
        }
        if (SReg != VCCReg)
          continue;
      }

      // Check for the bool flag output for instructions like V_ADD_I32_e64.
      const MachineOperand *SDst = TII->getNamedOperand(MI,
                                                        AMDGPU::OpName::sdst);

      if (SDst) {
        bool Next = false;

        if (SDst->getReg() != VCCReg) {
          if (SDst->getReg().isVirtual())
            MRI->setRegAllocationHint(SDst->getReg(), 0, VCCReg);
          Next = true;
        }

        // All of the instructions with carry outs also have an SGPR input in
        // src2.
        const MachineOperand *Src2 = TII->getNamedOperand(MI,
                                                          AMDGPU::OpName::src2);
        if (Src2 && Src2->getReg() != VCCReg) {
          if (Src2->getReg().isVirtual())
            MRI->setRegAllocationHint(Src2->getReg(), 0, VCCReg);
          Next = true;
        }

        if (Next)
          continue;
      }

      // Pre-GFX10, shrinking VOP3 instructions pre-RA gave us the chance to
      // fold an immediate into the shrunk instruction as a literal operand. In
      // GFX10 VOP3 instructions can take a literal operand anyway, so there is
      // no advantage to doing this.
      if (ST->hasVOP3Literal() &&
          !MF.getProperties().hasProperty(
              MachineFunctionProperties::Property::NoVRegs))
        continue;

      if (ST->hasTrue16BitInsts() && AMDGPU::isTrue16Inst(MI.getOpcode()) &&
          !shouldShrinkTrue16(MI))
        continue;

      // We can shrink this instruction
      LLVM_DEBUG(dbgs() << "Shrinking " << MI);

      MachineInstr *Inst32 = TII->buildShrunkInst(MI, Op32);
      ++NumInstructionsShrunk;

      // Copy extra operands not present in the instruction definition.
      copyExtraImplicitOps(*Inst32, MI);

      // Copy deadness from the old explicit vcc def to the new implicit def.
      if (SDst && SDst->isDead())
        Inst32->findRegisterDefOperand(VCCReg, /*TRI=*/nullptr)->setIsDead();

      MI.eraseFromParent();
      foldImmediates(*Inst32);

      LLVM_DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n');
    }
  }
  return false;
}