//===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
/// The pass tries to use the 32-bit encoding for instructions when possible.
//===----------------------------------------------------------------------===//
//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"

#define DEBUG_TYPE "si-shrink-instructions"

STATISTIC(NumInstructionsShrunk,
          "Number of 64-bit instructions reduced to 32-bit.");
STATISTIC(NumLiteralConstantsFolded,
          "Number of literal constants folded into 32-bit instructions.");

using namespace llvm;

namespace {

class SIShrinkInstructions : public MachineFunctionPass {
  MachineFunction *MF;
  MachineRegisterInfo *MRI;
  const GCNSubtarget *ST;
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;

public:
  static char ID;

public:
  SIShrinkInstructions() : MachineFunctionPass(ID) {
  }

  bool foldImmediates(MachineInstr &MI, bool TryToCommute = true) const;
  bool shouldShrinkTrue16(MachineInstr &MI) const;
  bool isKImmOperand(const MachineOperand &Src) const;
  bool isKUImmOperand(const MachineOperand &Src) const;
  bool isKImmOrKUImmOperand(const MachineOperand &Src, bool &IsUnsigned) const;
  bool isReverseInlineImm(const MachineOperand &Src, int32_t &ReverseImm) const;
  void copyExtraImplicitOps(MachineInstr &NewMI, MachineInstr &MI) const;
  void shrinkScalarCompare(MachineInstr &MI) const;
  void shrinkMIMG(MachineInstr &MI) const;
  void shrinkMadFma(MachineInstr &MI) const;
  bool shrinkScalarLogicOp(MachineInstr &MI) const;
  bool tryReplaceDeadSDST(MachineInstr &MI) const;
  bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R,
                     Register Reg, unsigned SubReg) const;
  bool instReadsReg(const MachineInstr *MI, unsigned Reg,
                    unsigned SubReg) const;
  bool instModifiesReg(const MachineInstr *MI, unsigned Reg,
                       unsigned SubReg) const;
  TargetInstrInfo::RegSubRegPair getSubRegForIndex(Register Reg, unsigned Sub,
                                                   unsigned I) const;
  void dropInstructionKeepingImpDefs(MachineInstr &MI) const;
  MachineInstr *matchSwap(MachineInstr &MovT) const;

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Shrink Instructions"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS(SIShrinkInstructions, DEBUG_TYPE,
                "SI Shrink Instructions", false, false)

char SIShrinkInstructions::ID = 0;

FunctionPass *llvm::createSIShrinkInstructionsPass() {
  return new SIShrinkInstructions();
}

/// This function checks \p MI for operands defined by a move immediate
/// instruction and then folds the literal constant into the instruction if it
/// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instruction.
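///
/// Illustrative example (MIR-like pseudo syntax, assuming the literal is legal
/// for src0 of the user):
///   %1 = V_MOV_B32_e32 0x11223344
///   %2 = V_ADD_F32_e32 %1, %0
/// becomes
///   %2 = V_ADD_F32_e32 0x11223344, %0
/// and the V_MOV is erased if it has no remaining non-debug uses.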
bool SIShrinkInstructions::foldImmediates(MachineInstr &MI,
                                          bool TryToCommute) const {
  assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI));

  int Src0Idx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);

  // Try to fold Src0
  MachineOperand &Src0 = MI.getOperand(Src0Idx);
  if (Src0.isReg()) {
    Register Reg = Src0.getReg();
    if (Reg.isVirtual()) {
      MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
      if (Def && Def->isMoveImmediate()) {
        MachineOperand &MovSrc = Def->getOperand(1);
        bool ConstantFolded = false;

        if (TII->isOperandLegal(MI, Src0Idx, &MovSrc)) {
          if (MovSrc.isImm()) {
            Src0.ChangeToImmediate(MovSrc.getImm());
            ConstantFolded = true;
          } else if (MovSrc.isFI()) {
            Src0.ChangeToFrameIndex(MovSrc.getIndex());
            ConstantFolded = true;
          } else if (MovSrc.isGlobal()) {
            Src0.ChangeToGA(MovSrc.getGlobal(), MovSrc.getOffset(),
                            MovSrc.getTargetFlags());
            ConstantFolded = true;
          }
        }

        if (ConstantFolded) {
          if (MRI->use_nodbg_empty(Reg))
            Def->eraseFromParent();
          ++NumLiteralConstantsFolded;
          return true;
        }
      }
    }
  }

  // We have failed to fold src0, so commute the instruction and try again.
  if (TryToCommute && MI.isCommutable()) {
    if (TII->commuteInstruction(MI)) {
      if (foldImmediates(MI, false))
        return true;

      // Commute back.
      TII->commuteInstruction(MI);
    }
  }

  return false;
}

/// Do not shrink the instruction if its registers are not expressible in the
/// shrunk encoding.
bool SIShrinkInstructions::shouldShrinkTrue16(MachineInstr &MI) const {
  for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
    const MachineOperand &MO = MI.getOperand(I);
    if (MO.isReg()) {
      Register Reg = MO.getReg();
      assert(!Reg.isVirtual() && "Prior checks should ensure we only shrink "
                                 "True16 Instructions post-RA");
      if (AMDGPU::VGPR_32RegClass.contains(Reg) &&
          !AMDGPU::VGPR_32_Lo128RegClass.contains(Reg))
        return false;
    }
  }
  return true;
}

bool SIShrinkInstructions::isKImmOperand(const MachineOperand &Src) const {
  return isInt<16>(SignExtend64(Src.getImm(), 32)) &&
         !TII->isInlineConstant(*Src.getParent(), Src.getOperandNo());
}

bool SIShrinkInstructions::isKUImmOperand(const MachineOperand &Src) const {
  return isUInt<16>(Src.getImm()) &&
         !TII->isInlineConstant(*Src.getParent(), Src.getOperandNo());
}

bool SIShrinkInstructions::isKImmOrKUImmOperand(const MachineOperand &Src,
                                                bool &IsUnsigned) const {
  if (isInt<16>(SignExtend64(Src.getImm(), 32))) {
    IsUnsigned = false;
    return !TII->isInlineConstant(Src);
  }

  if (isUInt<16>(Src.getImm())) {
    IsUnsigned = true;
    return !TII->isInlineConstant(Src);
  }

  return false;
}

/// \returns true if the constant in \p Src should be replaced with a bitreverse
/// of an inline immediate.
bool SIShrinkInstructions::isReverseInlineImm(const MachineOperand &Src,
                                              int32_t &ReverseImm) const {
  if (!isInt<32>(Src.getImm()) || TII->isInlineConstant(Src))
    return false;

  ReverseImm = reverseBits<int32_t>(static_cast<int32_t>(Src.getImm()));
  return ReverseImm >= -16 && ReverseImm <= 64;
}

/// Copy implicit register operands from the specified instruction to this
/// instruction that are not part of the instruction definition.
void SIShrinkInstructions::copyExtraImplicitOps(MachineInstr &NewMI,
                                                MachineInstr &MI) const {
  MachineFunction &MF = *MI.getMF();
  for (unsigned i = MI.getDesc().getNumOperands() +
                    MI.getDesc().implicit_uses().size() +
                    MI.getDesc().implicit_defs().size(),
                e = MI.getNumOperands();
       i != e; ++i) {
    const MachineOperand &MO = MI.getOperand(i);
    if ((MO.isReg() && MO.isImplicit()) || MO.isRegMask())
      NewMI.addOperand(MF, MO);
  }
}

void SIShrinkInstructions::shrinkScalarCompare(MachineInstr &MI) const {
  if (!ST->hasSCmpK())
    return;

  // cmpk instructions do scc = dst <cc op> imm16, so commute the instruction to
  // get constants on the RHS.
  if (!MI.getOperand(0).isReg())
    TII->commuteInstruction(MI, false, 0, 1);

  // cmpk requires src0 to be a register
  const MachineOperand &Src0 = MI.getOperand(0);
  if (!Src0.isReg())
    return;

  MachineOperand &Src1 = MI.getOperand(1);
  if (!Src1.isImm())
    return;

  int SOPKOpc = AMDGPU::getSOPKOp(MI.getOpcode());
  if (SOPKOpc == -1)
    return;

  // eq/ne is special because the imm16 can be treated as signed or unsigned,
  // and initially selected to the unsigned versions.
  if (SOPKOpc == AMDGPU::S_CMPK_EQ_U32 || SOPKOpc == AMDGPU::S_CMPK_LG_U32) {
    bool HasUImm;
    if (isKImmOrKUImmOperand(Src1, HasUImm)) {
      if (!HasUImm) {
        SOPKOpc = (SOPKOpc == AMDGPU::S_CMPK_EQ_U32) ?
          AMDGPU::S_CMPK_EQ_I32 : AMDGPU::S_CMPK_LG_I32;
        Src1.setImm(SignExtend32(Src1.getImm(), 32));
      }

      MI.setDesc(TII->get(SOPKOpc));
    }

    return;
  }

  const MCInstrDesc &NewDesc = TII->get(SOPKOpc);

  if ((TII->sopkIsZext(SOPKOpc) && isKUImmOperand(Src1)) ||
      (!TII->sopkIsZext(SOPKOpc) && isKImmOperand(Src1))) {
    if (!TII->sopkIsZext(SOPKOpc))
      Src1.setImm(SignExtend64(Src1.getImm(), 32));
    MI.setDesc(NewDesc);
  }
}

// Shrink NSA encoded instructions with contiguous VGPRs to non-NSA encoding.
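//
// For example (illustrative): a gfx10 NSA image_sample whose address operands
// happen to live in consecutive registers v4, v5, v6, v7 can be rewritten to
// the default encoding, which takes the single contiguous tuple v[4:7] and is
// shorter to encode than listing each address register separately.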
void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) const {
  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
  if (!Info)
    return;

  uint8_t NewEncoding;
  switch (Info->MIMGEncoding) {
  case AMDGPU::MIMGEncGfx10NSA:
    NewEncoding = AMDGPU::MIMGEncGfx10Default;
    break;
  case AMDGPU::MIMGEncGfx11NSA:
    NewEncoding = AMDGPU::MIMGEncGfx11Default;
    break;
  default:
    return;
  }

  int VAddr0Idx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
  unsigned NewAddrDwords = Info->VAddrDwords;
  const TargetRegisterClass *RC;

  if (Info->VAddrDwords == 2) {
    RC = &AMDGPU::VReg_64RegClass;
  } else if (Info->VAddrDwords == 3) {
    RC = &AMDGPU::VReg_96RegClass;
  } else if (Info->VAddrDwords == 4) {
    RC = &AMDGPU::VReg_128RegClass;
  } else if (Info->VAddrDwords == 5) {
    RC = &AMDGPU::VReg_160RegClass;
  } else if (Info->VAddrDwords == 6) {
    RC = &AMDGPU::VReg_192RegClass;
  } else if (Info->VAddrDwords == 7) {
    RC = &AMDGPU::VReg_224RegClass;
  } else if (Info->VAddrDwords == 8) {
    RC = &AMDGPU::VReg_256RegClass;
  } else if (Info->VAddrDwords == 9) {
    RC = &AMDGPU::VReg_288RegClass;
  } else if (Info->VAddrDwords == 10) {
    RC = &AMDGPU::VReg_320RegClass;
  } else if (Info->VAddrDwords == 11) {
    RC = &AMDGPU::VReg_352RegClass;
  } else if (Info->VAddrDwords == 12) {
    RC = &AMDGPU::VReg_384RegClass;
  } else {
    RC = &AMDGPU::VReg_512RegClass;
    NewAddrDwords = 16;
  }

  unsigned VgprBase = 0;
  unsigned NextVgpr = 0;
  bool IsUndef = true;
  bool IsKill = NewAddrDwords == Info->VAddrDwords;
  const unsigned NSAMaxSize = ST->getNSAMaxSize();
  const bool IsPartialNSA = NewAddrDwords > NSAMaxSize;
  const unsigned EndVAddr = IsPartialNSA ? NSAMaxSize : Info->VAddrOperands;
  for (unsigned Idx = 0; Idx < EndVAddr; ++Idx) {
    const MachineOperand &Op = MI.getOperand(VAddr0Idx + Idx);
    unsigned Vgpr = TRI->getHWRegIndex(Op.getReg());
    unsigned Dwords = TRI->getRegSizeInBits(Op.getReg(), *MRI) / 32;
    assert(Dwords > 0 && "Un-implemented for less than 32 bit regs");

    if (Idx == 0) {
      VgprBase = Vgpr;
      NextVgpr = Vgpr + Dwords;
    } else if (Vgpr == NextVgpr) {
      NextVgpr = Vgpr + Dwords;
    } else {
      return;
    }

    if (!Op.isUndef())
      IsUndef = false;
    if (!Op.isKill())
      IsKill = false;
  }

  if (VgprBase + NewAddrDwords > 256)
    return;

  // Further check for implicit tied operands - this may be present if TFE is
  // enabled
  int TFEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::tfe);
  int LWEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::lwe);
  unsigned TFEVal = (TFEIdx == -1) ? 0 : MI.getOperand(TFEIdx).getImm();
  unsigned LWEVal = (LWEIdx == -1) ? 0 : MI.getOperand(LWEIdx).getImm();
  int ToUntie = -1;
  if (TFEVal || LWEVal) {
    // TFE/LWE is enabled so we need to deal with an implicit tied operand
    for (unsigned i = LWEIdx + 1, e = MI.getNumOperands(); i != e; ++i) {
      if (MI.getOperand(i).isReg() && MI.getOperand(i).isTied() &&
          MI.getOperand(i).isImplicit()) {
        // This is the tied operand
        assert(
            ToUntie == -1 &&
            "found more than one tied implicit operand when expecting only 1");
        ToUntie = i;
        MI.untieRegOperand(ToUntie);
      }
    }
  }

  unsigned NewOpcode = AMDGPU::getMIMGOpcode(Info->BaseOpcode, NewEncoding,
                                             Info->VDataDwords, NewAddrDwords);
  MI.setDesc(TII->get(NewOpcode));
  MI.getOperand(VAddr0Idx).setReg(RC->getRegister(VgprBase));
  MI.getOperand(VAddr0Idx).setIsUndef(IsUndef);
  MI.getOperand(VAddr0Idx).setIsKill(IsKill);

  for (unsigned i = 1; i < EndVAddr; ++i)
    MI.removeOperand(VAddr0Idx + 1);

  if (ToUntie >= 0) {
    MI.tieOperands(
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata),
        ToUntie - (EndVAddr - 1));
  }
}

// Shrink MAD to MADAK/MADMK and FMA to FMAAK/FMAMK.
void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
  // Pre-GFX10 VOP3 instructions like MAD/FMA cannot take a literal operand so
  // there is no reason to try to shrink them.
  if (!ST->hasVOP3Literal())
    return;

  // There is no advantage to doing this pre-RA.
  if (!MF->getProperties().hasProperty(
          MachineFunctionProperties::Property::NoVRegs))
    return;

  if (TII->hasAnyModifiersSet(MI))
    return;

  const unsigned Opcode = MI.getOpcode();
  MachineOperand &Src0 = *TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand &Src1 = *TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  MachineOperand &Src2 = *TII->getNamedOperand(MI, AMDGPU::OpName::src2);
  unsigned NewOpcode = AMDGPU::INSTRUCTION_LIST_END;

  bool Swap;

  // Detect "Dst = VSrc * VGPR + Imm" and convert to AK form.
  if (Src2.isImm() && !TII->isInlineConstant(Src2)) {
    if (Src1.isReg() && TRI->isVGPR(*MRI, Src1.getReg()))
      Swap = false;
    else if (Src0.isReg() && TRI->isVGPR(*MRI, Src0.getReg()))
      Swap = true;
    else
      return;

    switch (Opcode) {
    default:
      llvm_unreachable("Unexpected mad/fma opcode!");
    case AMDGPU::V_MAD_F32_e64:
      NewOpcode = AMDGPU::V_MADAK_F32;
      break;
    case AMDGPU::V_FMA_F32_e64:
      NewOpcode = AMDGPU::V_FMAAK_F32;
      break;
    case AMDGPU::V_MAD_F16_e64:
      NewOpcode = AMDGPU::V_MADAK_F16;
      break;
    case AMDGPU::V_FMA_F16_e64:
    case AMDGPU::V_FMA_F16_gfx9_e64:
      NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_t16
                                          : AMDGPU::V_FMAAK_F16;
      break;
    }
  }

  // Detect "Dst = VSrc * Imm + VGPR" and convert to MK form.
  if (Src2.isReg() && TRI->isVGPR(*MRI, Src2.getReg())) {
    if (Src1.isImm() && !TII->isInlineConstant(Src1))
      Swap = false;
    else if (Src0.isImm() && !TII->isInlineConstant(Src0))
      Swap = true;
    else
      return;

    switch (Opcode) {
    default:
      llvm_unreachable("Unexpected mad/fma opcode!");
    case AMDGPU::V_MAD_F32_e64:
      NewOpcode = AMDGPU::V_MADMK_F32;
      break;
    case AMDGPU::V_FMA_F32_e64:
      NewOpcode = AMDGPU::V_FMAMK_F32;
      break;
    case AMDGPU::V_MAD_F16_e64:
      NewOpcode = AMDGPU::V_MADMK_F16;
      break;
    case AMDGPU::V_FMA_F16_e64:
    case AMDGPU::V_FMA_F16_gfx9_e64:
      NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_t16
                                          : AMDGPU::V_FMAMK_F16;
      break;
    }
  }

  if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END)
    return;

  if (AMDGPU::isTrue16Inst(NewOpcode) && !shouldShrinkTrue16(MI))
    return;

  if (Swap) {
    // Swap Src0 and Src1 by building a new instruction.
    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(NewOpcode),
            MI.getOperand(0).getReg())
        .add(Src1)
        .add(Src0)
        .add(Src2)
        .setMIFlags(MI.getFlags());
    MI.eraseFromParent();
  } else {
    TII->removeModOperands(MI);
    MI.setDesc(TII->get(NewOpcode));
  }
}

/// Attempt to shrink AND/OR/XOR operations requiring non-inlineable literals.
/// For AND or OR, try using S_BITSET{0,1} to clear or set bits.
/// If the inverse of the immediate is legal, use ANDN2, ORN2 or
/// XNOR (as a ^ b == ~(a ^ ~b)).
/// \returns true if the caller should continue the machine function iterator
bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();
  const MachineOperand *Dest = &MI.getOperand(0);
  MachineOperand *Src0 = &MI.getOperand(1);
  MachineOperand *Src1 = &MI.getOperand(2);
  MachineOperand *SrcReg = Src0;
  MachineOperand *SrcImm = Src1;

  if (!SrcImm->isImm() ||
      AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST->hasInv2PiInlineImm()))
    return false;

  uint32_t Imm = static_cast<uint32_t>(SrcImm->getImm());
  uint32_t NewImm = 0;

  if (Opc == AMDGPU::S_AND_B32) {
    if (isPowerOf2_32(~Imm)) {
      NewImm = llvm::countr_one(Imm);
      Opc = AMDGPU::S_BITSET0_B32;
    } else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
      NewImm = ~Imm;
      Opc = AMDGPU::S_ANDN2_B32;
    }
  } else if (Opc == AMDGPU::S_OR_B32) {
    if (isPowerOf2_32(Imm)) {
      NewImm = llvm::countr_zero(Imm);
      Opc = AMDGPU::S_BITSET1_B32;
    } else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
      NewImm = ~Imm;
      Opc = AMDGPU::S_ORN2_B32;
    }
  } else if (Opc == AMDGPU::S_XOR_B32) {
    if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
      NewImm = ~Imm;
      Opc = AMDGPU::S_XNOR_B32;
    }
  } else {
    llvm_unreachable("unexpected opcode");
  }

  if (NewImm != 0) {
    if (Dest->getReg().isVirtual() && SrcReg->isReg()) {
      MRI->setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg());
      MRI->setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg());
      return true;
    }

    if (SrcReg->isReg() && SrcReg->getReg() == Dest->getReg()) {
      const bool IsUndef = SrcReg->isUndef();
      const bool IsKill = SrcReg->isKill();
      MI.setDesc(TII->get(Opc));
      if (Opc == AMDGPU::S_BITSET0_B32 ||
          Opc == AMDGPU::S_BITSET1_B32) {
        Src0->ChangeToImmediate(NewImm);
        // Remove the immediate and add the tied input.
        MI.getOperand(2).ChangeToRegister(Dest->getReg(), /*IsDef*/ false,
                                          /*isImp*/ false, IsKill,
                                          /*isDead*/ false, IsUndef);
        MI.tieOperands(0, 2);
      } else {
        SrcImm->setImm(NewImm);
      }
    }
  }

  return false;
}

// This is the same as MachineInstr::readsRegister/modifiesRegister except
// it takes subregs into account.
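//
// For example (illustrative), with a 64-bit virtual register %x, an operand
// that reads %x.sub0 overlaps a query for (%x, sub0) but not one for
// (%x, sub1), whereas plain readsRegister/modifiesRegister would report a
// conflict in both cases.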
bool SIShrinkInstructions::instAccessReg(
    iterator_range<MachineInstr::const_mop_iterator> &&R, Register Reg,
    unsigned SubReg) const {
  for (const MachineOperand &MO : R) {
    if (!MO.isReg())
      continue;

    if (Reg.isPhysical() && MO.getReg().isPhysical()) {
      if (TRI->regsOverlap(Reg, MO.getReg()))
        return true;
    } else if (MO.getReg() == Reg && Reg.isVirtual()) {
      LaneBitmask Overlap = TRI->getSubRegIndexLaneMask(SubReg) &
                            TRI->getSubRegIndexLaneMask(MO.getSubReg());
      if (Overlap.any())
        return true;
    }
  }
  return false;
}

bool SIShrinkInstructions::instReadsReg(const MachineInstr *MI, unsigned Reg,
                                        unsigned SubReg) const {
  return instAccessReg(MI->uses(), Reg, SubReg);
}

bool SIShrinkInstructions::instModifiesReg(const MachineInstr *MI, unsigned Reg,
                                           unsigned SubReg) const {
  return instAccessReg(MI->defs(), Reg, SubReg);
}

TargetInstrInfo::RegSubRegPair
SIShrinkInstructions::getSubRegForIndex(Register Reg, unsigned Sub,
                                        unsigned I) const {
  if (TRI->getRegSizeInBits(Reg, *MRI) != 32) {
    if (Reg.isPhysical()) {
      Reg = TRI->getSubReg(Reg, TRI->getSubRegFromChannel(I));
    } else {
      Sub = TRI->getSubRegFromChannel(I + TRI->getChannelFromSubReg(Sub));
    }
  }
  return TargetInstrInfo::RegSubRegPair(Reg, Sub);
}

void SIShrinkInstructions::dropInstructionKeepingImpDefs(
    MachineInstr &MI) const {
  for (unsigned i = MI.getDesc().getNumOperands() +
                    MI.getDesc().implicit_uses().size() +
                    MI.getDesc().implicit_defs().size(),
                e = MI.getNumOperands();
       i != e; ++i) {
    const MachineOperand &Op = MI.getOperand(i);
    if (!Op.isDef())
      continue;
    BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
            TII->get(AMDGPU::IMPLICIT_DEF), Op.getReg());
  }

  MI.eraseFromParent();
}

// Match:
// mov t, x
// mov x, y
// mov y, t
//
// =>
//
// mov t, x (t is potentially dead and move eliminated)
// v_swap_b32 x, y
//
// Returns a pointer to the next valid instruction if it was able to create
// v_swap_b32.
//
// This should not be done too early, so as not to prevent folding that could
// remove the matched moves; it should preferably run before RA to release the
// saved registers, and possibly again after RA, which can insert copies too.
//
// This is really just a generic peephole that is not a canonical shrinking,
// although requirements match the pass placement and it reduces code size too.
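//
// Note: for a 64-bit (or wider) move/COPY the replacement is emitted as one
// v_swap_b32 per 32-bit lane.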
MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const {
  assert(MovT.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
         MovT.getOpcode() == AMDGPU::COPY);

  Register T = MovT.getOperand(0).getReg();
  unsigned Tsub = MovT.getOperand(0).getSubReg();
  MachineOperand &Xop = MovT.getOperand(1);

  if (!Xop.isReg())
    return nullptr;
  Register X = Xop.getReg();
  unsigned Xsub = Xop.getSubReg();

  unsigned Size = TII->getOpSize(MovT, 0) / 4;

  if (!TRI->isVGPR(*MRI, X))
    return nullptr;

  const unsigned SearchLimit = 16;
  unsigned Count = 0;
  bool KilledT = false;
  for (auto Iter = std::next(MovT.getIterator()),
            E = MovT.getParent()->instr_end();
       Iter != E && Count < SearchLimit && !KilledT; ++Iter, ++Count) {

    MachineInstr *MovY = &*Iter;
    KilledT = MovY->killsRegister(T, TRI);

    if ((MovY->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
         MovY->getOpcode() != AMDGPU::COPY) ||
        !MovY->getOperand(1).isReg() ||
        MovY->getOperand(1).getReg() != T ||
        MovY->getOperand(1).getSubReg() != Tsub)
      continue;

    Register Y = MovY->getOperand(0).getReg();
    unsigned Ysub = MovY->getOperand(0).getSubReg();

    if (!TRI->isVGPR(*MRI, Y))
      continue;

    MachineInstr *MovX = nullptr;
    for (auto IY = MovY->getIterator(), I = std::next(MovT.getIterator());
         I != IY; ++I) {
      if (instReadsReg(&*I, X, Xsub) || instModifiesReg(&*I, Y, Ysub) ||
          instModifiesReg(&*I, T, Tsub) ||
          (MovX && instModifiesReg(&*I, X, Xsub))) {
        MovX = nullptr;
        break;
      }
      if (!instReadsReg(&*I, Y, Ysub)) {
        if (!MovX && instModifiesReg(&*I, X, Xsub)) {
          MovX = nullptr;
          break;
        }
        continue;
      }
      if (MovX ||
          (I->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
           I->getOpcode() != AMDGPU::COPY) ||
          I->getOperand(0).getReg() != X ||
          I->getOperand(0).getSubReg() != Xsub) {
        MovX = nullptr;
        break;
      }

      if (Size > 1 && (I->getNumImplicitOperands() > (I->isCopy() ? 0U : 1U)))
        continue;

      MovX = &*I;
    }

    if (!MovX)
      continue;

    LLVM_DEBUG(dbgs() << "Matched v_swap_b32:\n" << MovT << *MovX << *MovY);

    for (unsigned I = 0; I < Size; ++I) {
      TargetInstrInfo::RegSubRegPair X1, Y1;
      X1 = getSubRegForIndex(X, Xsub, I);
      Y1 = getSubRegForIndex(Y, Ysub, I);
      MachineBasicBlock &MBB = *MovT.getParent();
      auto MIB = BuildMI(MBB, MovX->getIterator(), MovT.getDebugLoc(),
                         TII->get(AMDGPU::V_SWAP_B32))
                     .addDef(X1.Reg, 0, X1.SubReg)
                     .addDef(Y1.Reg, 0, Y1.SubReg)
                     .addReg(Y1.Reg, 0, Y1.SubReg)
                     .addReg(X1.Reg, 0, X1.SubReg)
                     .getInstr();
      if (MovX->hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
        // Drop implicit EXEC.
        MIB->removeOperand(MIB->getNumExplicitOperands());
        MIB->copyImplicitOps(*MBB.getParent(), *MovX);
      }
    }
    MovX->eraseFromParent();
    dropInstructionKeepingImpDefs(*MovY);
    MachineInstr *Next = &*std::next(MovT.getIterator());

    if (T.isVirtual() && MRI->use_nodbg_empty(T)) {
      dropInstructionKeepingImpDefs(MovT);
    } else {
      Xop.setIsKill(false);
      for (int I = MovT.getNumImplicitOperands() - 1; I >= 0; --I) {
        unsigned OpNo = MovT.getNumExplicitOperands() + I;
        const MachineOperand &Op = MovT.getOperand(OpNo);
        if (Op.isKill() && TRI->regsOverlap(X, Op.getReg()))
          MovT.removeOperand(OpNo);
      }
    }

    return Next;
  }

  return nullptr;
}

// If an instruction has a dead sdst, replace it with the NULL register on
// gfx1030+.
bool SIShrinkInstructions::tryReplaceDeadSDST(MachineInstr &MI) const {
  if (!ST->hasGFX10_3Insts())
    return false;

  MachineOperand *Op = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
  if (!Op)
    return false;
  Register SDstReg = Op->getReg();
  if (SDstReg.isPhysical() || !MRI->use_nodbg_empty(SDstReg))
    return false;

  Op->setReg(ST->isWave32() ? AMDGPU::SGPR_NULL : AMDGPU::SGPR_NULL64);
  return true;
}

bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  this->MF = &MF;
  MRI = &MF.getRegInfo();
  ST = &MF.getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  unsigned VCCReg = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;

  std::vector<unsigned> I1Defs;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      if (MI.getOpcode() == AMDGPU::V_MOV_B32_e32) {
        // If this has a literal constant source that is the same as the
        // reversed bits of an inline immediate, replace with a bitreverse of
        // that constant. This saves 4 bytes in the common case of materializing
        // sign bits.

        // Test if we are after regalloc. We only want to do this after any
        // optimizations happen because this will confuse them.
        // XXX - not exactly a check for post-regalloc run.
        MachineOperand &Src = MI.getOperand(1);
        if (Src.isImm() && MI.getOperand(0).getReg().isPhysical()) {
          int32_t ReverseImm;
          if (isReverseInlineImm(Src, ReverseImm)) {
            MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32));
            Src.setImm(ReverseImm);
            continue;
          }
        }
      }

      if (ST->hasSwap() && (MI.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
                            MI.getOpcode() == AMDGPU::COPY)) {
        if (auto *NextMI = matchSwap(MI)) {
          Next = NextMI->getIterator();
          continue;
        }
      }

      // Try to use S_ADDK_I32 and S_MULK_I32.
      if (MI.getOpcode() == AMDGPU::S_ADD_I32 ||
          MI.getOpcode() == AMDGPU::S_MUL_I32) {
        const MachineOperand *Dest = &MI.getOperand(0);
        MachineOperand *Src0 = &MI.getOperand(1);
        MachineOperand *Src1 = &MI.getOperand(2);

        if (!Src0->isReg() && Src1->isReg()) {
          if (TII->commuteInstruction(MI, false, 1, 2))
            std::swap(Src0, Src1);
        }

        // FIXME: This could work better if hints worked with subregisters. If
        // we have a vector add of a constant, we usually don't get the correct
        // allocation due to the subregister usage.
        if (Dest->getReg().isVirtual() && Src0->isReg()) {
          MRI->setRegAllocationHint(Dest->getReg(), 0, Src0->getReg());
          MRI->setRegAllocationHint(Src0->getReg(), 0, Dest->getReg());
          continue;
        }

        if (Src0->isReg() && Src0->getReg() == Dest->getReg()) {
          if (Src1->isImm() && isKImmOperand(*Src1)) {
            unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ?
              AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32;

            Src1->setImm(SignExtend64(Src1->getImm(), 32));
            MI.setDesc(TII->get(Opc));
            MI.tieOperands(0, 1);
          }
        }
      }

      // Try to use s_cmpk_*
      if (MI.isCompare() && TII->isSOPC(MI)) {
        shrinkScalarCompare(MI);
        continue;
      }

      // Try to use S_MOVK_I32, which will save 4 bytes for small immediates.
      if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
        const MachineOperand &Dst = MI.getOperand(0);
        MachineOperand &Src = MI.getOperand(1);

        if (Src.isImm() && Dst.getReg().isPhysical()) {
          int32_t ReverseImm;
          if (isKImmOperand(Src)) {
            MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
            Src.setImm(SignExtend64(Src.getImm(), 32));
          } else if (isReverseInlineImm(Src, ReverseImm)) {
            MI.setDesc(TII->get(AMDGPU::S_BREV_B32));
            Src.setImm(ReverseImm);
          }
        }

        continue;
      }

      // Shrink scalar logic operations.
      if (MI.getOpcode() == AMDGPU::S_AND_B32 ||
          MI.getOpcode() == AMDGPU::S_OR_B32 ||
          MI.getOpcode() == AMDGPU::S_XOR_B32) {
        if (shrinkScalarLogicOp(MI))
          continue;
      }

      if (TII->isMIMG(MI.getOpcode()) &&
          ST->getGeneration() >= AMDGPUSubtarget::GFX10 &&
          MF.getProperties().hasProperty(
              MachineFunctionProperties::Property::NoVRegs)) {
        shrinkMIMG(MI);
        continue;
      }

      if (!TII->isVOP3(MI))
        continue;

      if (MI.getOpcode() == AMDGPU::V_MAD_F32_e64 ||
          MI.getOpcode() == AMDGPU::V_FMA_F32_e64 ||
          MI.getOpcode() == AMDGPU::V_MAD_F16_e64 ||
          MI.getOpcode() == AMDGPU::V_FMA_F16_e64 ||
          MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_e64) {
        shrinkMadFma(MI);
        continue;
      }

      if (!TII->hasVALU32BitEncoding(MI.getOpcode())) {
        // If there is no chance we will shrink it and use VCC as sdst to get
        // a 32-bit form, try to replace the dead sdst with NULL.
        tryReplaceDeadSDST(MI);
        continue;
      }

      if (!TII->canShrink(MI, *MRI)) {
        // Try commuting the instruction and see if that enables us to shrink
        // it.
        if (!MI.isCommutable() || !TII->commuteInstruction(MI) ||
            !TII->canShrink(MI, *MRI)) {
          tryReplaceDeadSDST(MI);
          continue;
        }
      }

      int Op32 = AMDGPU::getVOPe32(MI.getOpcode());

      if (TII->isVOPC(Op32)) {
        MachineOperand &Op0 = MI.getOperand(0);
        if (Op0.isReg()) {
          // Exclude VOPCX instructions as these don't explicitly write a
          // dst.
          Register DstReg = Op0.getReg();
          if (DstReg.isVirtual()) {
            // VOPC instructions can only write to the VCC register. We can't
            // force them to use VCC here, because this is only one register and
            // cannot deal with sequences which would require multiple copies of
            // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
            //
            // So, instead of forcing the instruction to write to VCC, we
            // provide a hint to the register allocator to use VCC and then we
            // will run this pass again after RA and shrink it if it outputs to
            // VCC.
            MRI->setRegAllocationHint(DstReg, 0, VCCReg);
            continue;
          }
          if (DstReg != VCCReg)
            continue;
        }
      }

      if (Op32 == AMDGPU::V_CNDMASK_B32_e32) {
        // We shrink V_CNDMASK_B32_e64 using regalloc hints like we do for VOPC
        // instructions.
        const MachineOperand *Src2 =
            TII->getNamedOperand(MI, AMDGPU::OpName::src2);
        if (!Src2->isReg())
          continue;
        Register SReg = Src2->getReg();
        if (SReg.isVirtual()) {
          MRI->setRegAllocationHint(SReg, 0, VCCReg);
          continue;
        }
        if (SReg != VCCReg)
          continue;
      }

      // Check for the bool flag output for instructions like V_ADD_I32_e64.
      const MachineOperand *SDst = TII->getNamedOperand(MI,
                                                        AMDGPU::OpName::sdst);

      if (SDst) {
        bool Next = false;

        if (SDst->getReg() != VCCReg) {
          if (SDst->getReg().isVirtual())
            MRI->setRegAllocationHint(SDst->getReg(), 0, VCCReg);
          Next = true;
        }

        // All of the instructions with carry outs also have an SGPR input in
        // src2.
        const MachineOperand *Src2 = TII->getNamedOperand(MI,
                                                          AMDGPU::OpName::src2);
        if (Src2 && Src2->getReg() != VCCReg) {
          if (Src2->getReg().isVirtual())
            MRI->setRegAllocationHint(Src2->getReg(), 0, VCCReg);
          Next = true;
        }

        if (Next)
          continue;
      }

      // Pre-GFX10, shrinking VOP3 instructions pre-RA gave us the chance to
      // fold an immediate into the shrunk instruction as a literal operand. In
      // GFX10 VOP3 instructions can take a literal operand anyway, so there is
      // no advantage to doing this.
      if (ST->hasVOP3Literal() &&
          !MF.getProperties().hasProperty(
              MachineFunctionProperties::Property::NoVRegs))
        continue;

      if (ST->hasTrue16BitInsts() && AMDGPU::isTrue16Inst(MI.getOpcode()) &&
          !shouldShrinkTrue16(MI))
        continue;

      // We can shrink this instruction
      LLVM_DEBUG(dbgs() << "Shrinking " << MI);

      MachineInstr *Inst32 = TII->buildShrunkInst(MI, Op32);
      ++NumInstructionsShrunk;

      // Copy extra operands not present in the instruction definition.
      copyExtraImplicitOps(*Inst32, MI);

      // Copy deadness from the old explicit vcc def to the new implicit def.
      if (SDst && SDst->isDead())
        Inst32->findRegisterDefOperand(VCCReg)->setIsDead();

      MI.eraseFromParent();
      foldImmediates(*Inst32);

      LLVM_DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n');
    }
  }
  return false;
}