//===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
/// The pass tries to use the 32-bit encoding for instructions when possible.
//===----------------------------------------------------------------------===//
//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"

#define DEBUG_TYPE "si-shrink-instructions"

STATISTIC(NumInstructionsShrunk,
          "Number of 64-bit instructions reduced to 32-bit.");
STATISTIC(NumLiteralConstantsFolded,
          "Number of literal constants folded into 32-bit instructions.");

using namespace llvm;

namespace {

class SIShrinkInstructions : public MachineFunctionPass {
  MachineFunction *MF;
  MachineRegisterInfo *MRI;
  const GCNSubtarget *ST;
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;

public:
  static char ID;

  SIShrinkInstructions() : MachineFunctionPass(ID) {
  }

  bool foldImmediates(MachineInstr &MI, bool TryToCommute = true) const;
  bool shouldShrinkTrue16(MachineInstr &MI) const;
  bool isKImmOperand(const MachineOperand &Src) const;
  bool isKUImmOperand(const MachineOperand &Src) const;
  bool isKImmOrKUImmOperand(const MachineOperand &Src, bool &IsUnsigned) const;
  bool isReverseInlineImm(const MachineOperand &Src, int32_t &ReverseImm) const;
  void copyExtraImplicitOps(MachineInstr &NewMI, MachineInstr &MI) const;
  void shrinkScalarCompare(MachineInstr &MI) const;
  void shrinkMIMG(MachineInstr &MI) const;
  void shrinkMadFma(MachineInstr &MI) const;
  bool shrinkScalarLogicOp(MachineInstr &MI) const;
  bool tryReplaceDeadSDST(MachineInstr &MI) const;
  bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R,
                     Register Reg, unsigned SubReg) const;
  bool instReadsReg(const MachineInstr *MI, unsigned Reg,
                    unsigned SubReg) const;
  bool instModifiesReg(const MachineInstr *MI, unsigned Reg,
                       unsigned SubReg) const;
  TargetInstrInfo::RegSubRegPair getSubRegForIndex(Register Reg, unsigned Sub,
                                                   unsigned I) const;
  void dropInstructionKeepingImpDefs(MachineInstr &MI) const;
  MachineInstr *matchSwap(MachineInstr &MovT) const;

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Shrink Instructions"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS(SIShrinkInstructions, DEBUG_TYPE,
                "SI Shrink Instructions", false, false)

char SIShrinkInstructions::ID = 0;

FunctionPass *llvm::createSIShrinkInstructionsPass() {
  return new SIShrinkInstructions();
}

/// This function checks \p MI for operands defined by a move immediate
/// instruction and then folds the literal constant into the instruction if it
/// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instruction.
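///
/// For example (illustrative only, register numbers are arbitrary):
///   v_mov_b32_e32 v1, 0x41b80000
///   v_add_f32_e32 v2, v1, v0
/// can become
///   v_add_f32_e32 v2, 0x41b80000, v0
/// after which the v_mov may be erased if it has no remaining uses.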
bool SIShrinkInstructions::foldImmediates(MachineInstr &MI,
                                          bool TryToCommute) const {
  assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI));

  int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);

  // Try to fold Src0
  MachineOperand &Src0 = MI.getOperand(Src0Idx);
  if (Src0.isReg()) {
    Register Reg = Src0.getReg();
    if (Reg.isVirtual()) {
      MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
      if (Def && Def->isMoveImmediate()) {
        MachineOperand &MovSrc = Def->getOperand(1);
        bool ConstantFolded = false;

        if (TII->isOperandLegal(MI, Src0Idx, &MovSrc)) {
          if (MovSrc.isImm() &&
              (isInt<32>(MovSrc.getImm()) || isUInt<32>(MovSrc.getImm()))) {
            Src0.ChangeToImmediate(MovSrc.getImm());
            ConstantFolded = true;
          } else if (MovSrc.isFI()) {
            Src0.ChangeToFrameIndex(MovSrc.getIndex());
            ConstantFolded = true;
          } else if (MovSrc.isGlobal()) {
            Src0.ChangeToGA(MovSrc.getGlobal(), MovSrc.getOffset(),
                            MovSrc.getTargetFlags());
            ConstantFolded = true;
          }
        }

        if (ConstantFolded) {
          if (MRI->use_nodbg_empty(Reg))
            Def->eraseFromParent();
          ++NumLiteralConstantsFolded;
          return true;
        }
      }
    }
  }

  // We have failed to fold src0, so commute the instruction and try again.
  if (TryToCommute && MI.isCommutable()) {
    if (TII->commuteInstruction(MI)) {
      if (foldImmediates(MI, false))
        return true;

      // Commute back.
      TII->commuteInstruction(MI);
    }
  }

  return false;
}

/// Do not shrink the instruction if its registers are not expressible in the
/// shrunk encoding.
bool SIShrinkInstructions::shouldShrinkTrue16(MachineInstr &MI) const {
  for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
    const MachineOperand &MO = MI.getOperand(I);
    if (MO.isReg()) {
      Register Reg = MO.getReg();
      assert(!Reg.isVirtual() && "Prior checks should ensure we only shrink "
                                 "True16 Instructions post-RA");
      if (AMDGPU::VGPR_32RegClass.contains(Reg) &&
          !AMDGPU::VGPR_32_Lo128RegClass.contains(Reg))
        return false;
    }
  }
  return true;
}

bool SIShrinkInstructions::isKImmOperand(const MachineOperand &Src) const {
  return isInt<16>(Src.getImm()) &&
         !TII->isInlineConstant(*Src.getParent(),
                                Src.getParent()->getOperandNo(&Src));
}

bool SIShrinkInstructions::isKUImmOperand(const MachineOperand &Src) const {
  return isUInt<16>(Src.getImm()) &&
         !TII->isInlineConstant(*Src.getParent(),
                                Src.getParent()->getOperandNo(&Src));
}

bool SIShrinkInstructions::isKImmOrKUImmOperand(const MachineOperand &Src,
                                                bool &IsUnsigned) const {
  if (isInt<16>(Src.getImm())) {
    IsUnsigned = false;
    return !TII->isInlineConstant(Src);
  }

  if (isUInt<16>(Src.getImm())) {
    IsUnsigned = true;
    return !TII->isInlineConstant(Src);
  }

  return false;
}

/// \returns true if the constant in \p Src should be replaced with a bitreverse
/// of an inline immediate.
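///
/// For example (illustrative), materializing the sign-bit mask 0x80000000
/// normally requires a 32-bit literal, but reverseBits(0x80000000) == 1 is an
/// inline immediate, so the value can instead be produced as a bit-reverse of
/// 1 (e.g. v_bfrev_b32 / s_brev_b32) without the extra literal dword.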
bool SIShrinkInstructions::isReverseInlineImm(const MachineOperand &Src,
                                              int32_t &ReverseImm) const {
  if (!isInt<32>(Src.getImm()) || TII->isInlineConstant(Src))
    return false;

  ReverseImm = reverseBits<int32_t>(static_cast<int32_t>(Src.getImm()));
  return ReverseImm >= -16 && ReverseImm <= 64;
}

/// Copy implicit register operands from specified instruction to this
/// instruction that are not part of the instruction definition.
void SIShrinkInstructions::copyExtraImplicitOps(MachineInstr &NewMI,
                                                MachineInstr &MI) const {
  MachineFunction &MF = *MI.getMF();
  for (unsigned i = MI.getDesc().getNumOperands() +
                    MI.getDesc().implicit_uses().size() +
                    MI.getDesc().implicit_defs().size(),
                e = MI.getNumOperands();
       i != e; ++i) {
    const MachineOperand &MO = MI.getOperand(i);
    if ((MO.isReg() && MO.isImplicit()) || MO.isRegMask())
      NewMI.addOperand(MF, MO);
  }
}

void SIShrinkInstructions::shrinkScalarCompare(MachineInstr &MI) const {
  // cmpk instructions do scc = dst <cc op> imm16, so commute the instruction to
  // get constants on the RHS.
  if (!MI.getOperand(0).isReg())
    TII->commuteInstruction(MI, false, 0, 1);

  // cmpk requires src0 to be a register
  const MachineOperand &Src0 = MI.getOperand(0);
  if (!Src0.isReg())
    return;

  const MachineOperand &Src1 = MI.getOperand(1);
  if (!Src1.isImm())
    return;

  int SOPKOpc = AMDGPU::getSOPKOp(MI.getOpcode());
  if (SOPKOpc == -1)
    return;

  // eq/ne is special because the imm16 can be treated as signed or unsigned,
  // and initially selected to the unsigned versions.
  if (SOPKOpc == AMDGPU::S_CMPK_EQ_U32 || SOPKOpc == AMDGPU::S_CMPK_LG_U32) {
    bool HasUImm;
    if (isKImmOrKUImmOperand(Src1, HasUImm)) {
      if (!HasUImm) {
        SOPKOpc = (SOPKOpc == AMDGPU::S_CMPK_EQ_U32) ?
          AMDGPU::S_CMPK_EQ_I32 : AMDGPU::S_CMPK_LG_I32;
      }

      MI.setDesc(TII->get(SOPKOpc));
    }

    return;
  }

  const MCInstrDesc &NewDesc = TII->get(SOPKOpc);

  if ((TII->sopkIsZext(SOPKOpc) && isKUImmOperand(Src1)) ||
      (!TII->sopkIsZext(SOPKOpc) && isKImmOperand(Src1))) {
    MI.setDesc(NewDesc);
  }
}

// Shrink NSA encoded instructions with contiguous VGPRs to non-NSA encoding.
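// For example (illustrative), a GFX10 NSA image_sample whose address operands
// happen to be the contiguous registers v4, v5 and v6 can drop the NSA
// encoding and instead use a single VReg_96 address starting at v4.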
void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) const {
  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
  if (!Info)
    return;

  uint8_t NewEncoding;
  switch (Info->MIMGEncoding) {
  case AMDGPU::MIMGEncGfx10NSA:
    NewEncoding = AMDGPU::MIMGEncGfx10Default;
    break;
  case AMDGPU::MIMGEncGfx11NSA:
    NewEncoding = AMDGPU::MIMGEncGfx11Default;
    break;
  default:
    return;
  }

  int VAddr0Idx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
  unsigned NewAddrDwords = Info->VAddrDwords;
  const TargetRegisterClass *RC;

  if (Info->VAddrDwords == 2) {
    RC = &AMDGPU::VReg_64RegClass;
  } else if (Info->VAddrDwords == 3) {
    RC = &AMDGPU::VReg_96RegClass;
  } else if (Info->VAddrDwords == 4) {
    RC = &AMDGPU::VReg_128RegClass;
  } else if (Info->VAddrDwords == 5) {
    RC = &AMDGPU::VReg_160RegClass;
  } else if (Info->VAddrDwords == 6) {
    RC = &AMDGPU::VReg_192RegClass;
  } else if (Info->VAddrDwords == 7) {
    RC = &AMDGPU::VReg_224RegClass;
  } else if (Info->VAddrDwords == 8) {
    RC = &AMDGPU::VReg_256RegClass;
  } else if (Info->VAddrDwords == 9) {
    RC = &AMDGPU::VReg_288RegClass;
  } else if (Info->VAddrDwords == 10) {
    RC = &AMDGPU::VReg_320RegClass;
  } else if (Info->VAddrDwords == 11) {
    RC = &AMDGPU::VReg_352RegClass;
  } else if (Info->VAddrDwords == 12) {
    RC = &AMDGPU::VReg_384RegClass;
  } else {
    RC = &AMDGPU::VReg_512RegClass;
    NewAddrDwords = 16;
  }

  unsigned VgprBase = 0;
  unsigned NextVgpr = 0;
  bool IsUndef = true;
  bool IsKill = NewAddrDwords == Info->VAddrDwords;
  for (unsigned Idx = 0; Idx < Info->VAddrOperands; ++Idx) {
    const MachineOperand &Op = MI.getOperand(VAddr0Idx + Idx);
    unsigned Vgpr = TRI->getHWRegIndex(Op.getReg());
    unsigned Dwords = TRI->getRegSizeInBits(Op.getReg(), *MRI) / 32;
    assert(Dwords > 0 && "Un-implemented for less than 32 bit regs");

    if (Idx == 0) {
      VgprBase = Vgpr;
      NextVgpr = Vgpr + Dwords;
    } else if (Vgpr == NextVgpr) {
      NextVgpr = Vgpr + Dwords;
    } else {
      return;
    }

    if (!Op.isUndef())
      IsUndef = false;
    if (!Op.isKill())
      IsKill = false;
  }

  if (VgprBase + NewAddrDwords > 256)
    return;

  // Further check for implicit tied operands - these may be present if TFE is
  // enabled.
  int TFEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::tfe);
  int LWEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::lwe);
  unsigned TFEVal = (TFEIdx == -1) ? 0 : MI.getOperand(TFEIdx).getImm();
  unsigned LWEVal = (LWEIdx == -1) ? 0 : MI.getOperand(LWEIdx).getImm();
  int ToUntie = -1;
  if (TFEVal || LWEVal) {
    // TFE/LWE is enabled so we need to deal with an implicit tied operand
    for (unsigned i = LWEIdx + 1, e = MI.getNumOperands(); i != e; ++i) {
      if (MI.getOperand(i).isReg() && MI.getOperand(i).isTied() &&
          MI.getOperand(i).isImplicit()) {
        // This is the tied operand
        assert(
            ToUntie == -1 &&
            "found more than one tied implicit operand when expecting only 1");
        ToUntie = i;
        MI.untieRegOperand(ToUntie);
      }
    }
  }

  unsigned NewOpcode = AMDGPU::getMIMGOpcode(Info->BaseOpcode, NewEncoding,
                                             Info->VDataDwords, NewAddrDwords);
  MI.setDesc(TII->get(NewOpcode));
  MI.getOperand(VAddr0Idx).setReg(RC->getRegister(VgprBase));
  MI.getOperand(VAddr0Idx).setIsUndef(IsUndef);
  MI.getOperand(VAddr0Idx).setIsKill(IsKill);

  for (int i = 1; i < Info->VAddrOperands; ++i)
    MI.removeOperand(VAddr0Idx + 1);

  if (ToUntie >= 0) {
    MI.tieOperands(
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata),
        ToUntie - (Info->VAddrOperands - 1));
  }
}

// Shrink MAD to MADAK/MADMK and FMA to FMAAK/FMAMK.
void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
  // Pre-GFX10 VOP3 instructions like MAD/FMA cannot take a literal operand so
  // there is no reason to try to shrink them.
  if (!ST->hasVOP3Literal())
    return;

  // There is no advantage to doing this pre-RA.
  if (!MF->getProperties().hasProperty(
          MachineFunctionProperties::Property::NoVRegs))
    return;

  if (TII->hasAnyModifiersSet(MI))
    return;

  const unsigned Opcode = MI.getOpcode();
  MachineOperand &Src0 = *TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand &Src1 = *TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  MachineOperand &Src2 = *TII->getNamedOperand(MI, AMDGPU::OpName::src2);
  unsigned NewOpcode = AMDGPU::INSTRUCTION_LIST_END;

  bool Swap;

  // Detect "Dst = VSrc * VGPR + Imm" and convert to AK form.
  if (Src2.isImm() && !TII->isInlineConstant(Src2)) {
    if (Src1.isReg() && TRI->isVGPR(*MRI, Src1.getReg()))
      Swap = false;
    else if (Src0.isReg() && TRI->isVGPR(*MRI, Src0.getReg()))
      Swap = true;
    else
      return;

    switch (Opcode) {
    default:
      llvm_unreachable("Unexpected mad/fma opcode!");
    case AMDGPU::V_MAD_F32_e64:
      NewOpcode = AMDGPU::V_MADAK_F32;
      break;
    case AMDGPU::V_FMA_F32_e64:
      NewOpcode = AMDGPU::V_FMAAK_F32;
      break;
    case AMDGPU::V_MAD_F16_e64:
      NewOpcode = AMDGPU::V_MADAK_F16;
      break;
    case AMDGPU::V_FMA_F16_e64:
    case AMDGPU::V_FMA_F16_gfx9_e64:
      NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_t16
                                          : AMDGPU::V_FMAAK_F16;
      break;
    }
  }

  // Detect "Dst = VSrc * Imm + VGPR" and convert to MK form.
  if (Src2.isReg() && TRI->isVGPR(*MRI, Src2.getReg())) {
    if (Src1.isImm() && !TII->isInlineConstant(Src1))
      Swap = false;
    else if (Src0.isImm() && !TII->isInlineConstant(Src0))
      Swap = true;
    else
      return;

    switch (Opcode) {
    default:
      llvm_unreachable("Unexpected mad/fma opcode!");
    case AMDGPU::V_MAD_F32_e64:
      NewOpcode = AMDGPU::V_MADMK_F32;
      break;
    case AMDGPU::V_FMA_F32_e64:
      NewOpcode = AMDGPU::V_FMAMK_F32;
      break;
    case AMDGPU::V_MAD_F16_e64:
      NewOpcode = AMDGPU::V_MADMK_F16;
      break;
    case AMDGPU::V_FMA_F16_e64:
    case AMDGPU::V_FMA_F16_gfx9_e64:
      NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_t16
                                          : AMDGPU::V_FMAMK_F16;
      break;
    }
  }

  if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END)
    return;

  if (AMDGPU::isTrue16Inst(NewOpcode) && !shouldShrinkTrue16(MI))
    return;

  if (Swap) {
    // Swap Src0 and Src1 by building a new instruction.
    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(NewOpcode),
            MI.getOperand(0).getReg())
        .add(Src1)
        .add(Src0)
        .add(Src2)
        .setMIFlags(MI.getFlags());
    MI.eraseFromParent();
  } else {
    TII->removeModOperands(MI);
    MI.setDesc(TII->get(NewOpcode));
  }
}

/// Attempt to shrink AND/OR/XOR operations requiring non-inlineable literals.
/// For AND or OR, try using S_BITSET{0,1} to clear or set bits.
/// If the inverse of the immediate is legal, use ANDN2, ORN2 or
/// XNOR (as a ^ b == ~(a ^ ~b)).
/// \returns true if the caller should continue the machine function iterator
bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();
  const MachineOperand *Dest = &MI.getOperand(0);
  MachineOperand *Src0 = &MI.getOperand(1);
  MachineOperand *Src1 = &MI.getOperand(2);
  MachineOperand *SrcReg = Src0;
  MachineOperand *SrcImm = Src1;

  if (!SrcImm->isImm() ||
      AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST->hasInv2PiInlineImm()))
    return false;

  uint32_t Imm = static_cast<uint32_t>(SrcImm->getImm());
  uint32_t NewImm = 0;

  if (Opc == AMDGPU::S_AND_B32) {
    if (isPowerOf2_32(~Imm)) {
      NewImm = countTrailingOnes(Imm);
      Opc = AMDGPU::S_BITSET0_B32;
    } else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
      NewImm = ~Imm;
      Opc = AMDGPU::S_ANDN2_B32;
    }
  } else if (Opc == AMDGPU::S_OR_B32) {
    if (isPowerOf2_32(Imm)) {
      NewImm = countTrailingZeros(Imm);
      Opc = AMDGPU::S_BITSET1_B32;
    } else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
      NewImm = ~Imm;
      Opc = AMDGPU::S_ORN2_B32;
    }
  } else if (Opc == AMDGPU::S_XOR_B32) {
    if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
      NewImm = ~Imm;
      Opc = AMDGPU::S_XNOR_B32;
    }
  } else {
    llvm_unreachable("unexpected opcode");
  }

  if (NewImm != 0) {
    if (Dest->getReg().isVirtual() && SrcReg->isReg()) {
      MRI->setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg());
      MRI->setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg());
      return true;
    }

    if (SrcReg->isReg() && SrcReg->getReg() == Dest->getReg()) {
      const bool IsUndef = SrcReg->isUndef();
      const bool IsKill = SrcReg->isKill();
      MI.setDesc(TII->get(Opc));
      if (Opc == AMDGPU::S_BITSET0_B32 ||
          Opc == AMDGPU::S_BITSET1_B32) {
        Src0->ChangeToImmediate(NewImm);
        // Remove the immediate and add the tied input.
        MI.getOperand(2).ChangeToRegister(Dest->getReg(), /*IsDef*/ false,
                                          /*isImp*/ false, IsKill,
                                          /*isDead*/ false, IsUndef);
        MI.tieOperands(0, 2);
      } else {
        SrcImm->setImm(NewImm);
      }
    }
  }

  return false;
}

// This is the same as MachineInstr::readsRegister/modifiesRegister except
// it takes subregs into account.
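// For example, a use of %0.sub1 is not treated as an access of %0.sub0,
// because the lane masks of the two subregister indexes do not overlap.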
bool SIShrinkInstructions::instAccessReg(
    iterator_range<MachineInstr::const_mop_iterator> &&R, Register Reg,
    unsigned SubReg) const {
  for (const MachineOperand &MO : R) {
    if (!MO.isReg())
      continue;

    if (Reg.isPhysical() && MO.getReg().isPhysical()) {
      if (TRI->regsOverlap(Reg, MO.getReg()))
        return true;
    } else if (MO.getReg() == Reg && Reg.isVirtual()) {
      LaneBitmask Overlap = TRI->getSubRegIndexLaneMask(SubReg) &
                            TRI->getSubRegIndexLaneMask(MO.getSubReg());
      if (Overlap.any())
        return true;
    }
  }
  return false;
}

bool SIShrinkInstructions::instReadsReg(const MachineInstr *MI, unsigned Reg,
                                        unsigned SubReg) const {
  return instAccessReg(MI->uses(), Reg, SubReg);
}

bool SIShrinkInstructions::instModifiesReg(const MachineInstr *MI, unsigned Reg,
                                           unsigned SubReg) const {
  return instAccessReg(MI->defs(), Reg, SubReg);
}

TargetInstrInfo::RegSubRegPair
SIShrinkInstructions::getSubRegForIndex(Register Reg, unsigned Sub,
                                        unsigned I) const {
  if (TRI->getRegSizeInBits(Reg, *MRI) != 32) {
    if (Reg.isPhysical()) {
      Reg = TRI->getSubReg(Reg, TRI->getSubRegFromChannel(I));
    } else {
      Sub = TRI->getSubRegFromChannel(I + TRI->getChannelFromSubReg(Sub));
    }
  }
  return TargetInstrInfo::RegSubRegPair(Reg, Sub);
}

void SIShrinkInstructions::dropInstructionKeepingImpDefs(
    MachineInstr &MI) const {
  for (unsigned i = MI.getDesc().getNumOperands() +
                    MI.getDesc().implicit_uses().size() +
                    MI.getDesc().implicit_defs().size(),
                e = MI.getNumOperands();
       i != e; ++i) {
    const MachineOperand &Op = MI.getOperand(i);
    if (!Op.isDef())
      continue;
    BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
            TII->get(AMDGPU::IMPLICIT_DEF), Op.getReg());
  }

  MI.eraseFromParent();
}

// Match:
// mov t, x
// mov x, y
// mov y, t
//
// =>
//
// mov t, x (t is potentially dead and move eliminated)
// v_swap_b32 x, y
//
// Returns the next valid instruction pointer if it was able to create
// v_swap_b32.
//
// This should not be done too early, so as not to prevent possible folding
// that may remove the matched moves. It is preferably done before RA, to
// release the saved registers, and possibly again after RA, which can insert
// copies too.
//
// This is really just a generic peephole that is not a canonical shrinking,
// although requirements match the pass placement and it reduces code size too.
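//
// For example (illustrative):
//   v_mov_b32_e32 v2, v0
//   v_mov_b32_e32 v0, v1
//   v_mov_b32_e32 v1, v2
// becomes
//   v_swap_b32 v0, v1
// with the copy into v2 dropped when v2 has no other uses.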
MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const {
  assert(MovT.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
         MovT.getOpcode() == AMDGPU::COPY);

  Register T = MovT.getOperand(0).getReg();
  unsigned Tsub = MovT.getOperand(0).getSubReg();
  MachineOperand &Xop = MovT.getOperand(1);

  if (!Xop.isReg())
    return nullptr;
  Register X = Xop.getReg();
  unsigned Xsub = Xop.getSubReg();

  unsigned Size = TII->getOpSize(MovT, 0) / 4;

  if (!TRI->isVGPR(*MRI, X))
    return nullptr;

  const unsigned SearchLimit = 16;
  unsigned Count = 0;
  bool KilledT = false;
  for (auto Iter = std::next(MovT.getIterator()),
            E = MovT.getParent()->instr_end();
       Iter != E && Count < SearchLimit && !KilledT; ++Iter, ++Count) {

    MachineInstr *MovY = &*Iter;
    KilledT = MovY->killsRegister(T, TRI);

    if ((MovY->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
         MovY->getOpcode() != AMDGPU::COPY) ||
        !MovY->getOperand(1).isReg() ||
        MovY->getOperand(1).getReg() != T ||
        MovY->getOperand(1).getSubReg() != Tsub)
      continue;

    Register Y = MovY->getOperand(0).getReg();
    unsigned Ysub = MovY->getOperand(0).getSubReg();

    if (!TRI->isVGPR(*MRI, Y))
      continue;

    MachineInstr *MovX = nullptr;
    for (auto IY = MovY->getIterator(), I = std::next(MovT.getIterator());
         I != IY; ++I) {
      if (instReadsReg(&*I, X, Xsub) || instModifiesReg(&*I, Y, Ysub) ||
          instModifiesReg(&*I, T, Tsub) ||
          (MovX && instModifiesReg(&*I, X, Xsub))) {
        MovX = nullptr;
        break;
      }
      if (!instReadsReg(&*I, Y, Ysub)) {
        if (!MovX && instModifiesReg(&*I, X, Xsub)) {
          MovX = nullptr;
          break;
        }
        continue;
      }
      if (MovX ||
          (I->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
           I->getOpcode() != AMDGPU::COPY) ||
          I->getOperand(0).getReg() != X ||
          I->getOperand(0).getSubReg() != Xsub) {
        MovX = nullptr;
        break;
      }

      if (Size > 1 && (I->getNumImplicitOperands() > (I->isCopy() ? 0U : 1U)))
        continue;

      MovX = &*I;
    }

    if (!MovX)
      continue;

    LLVM_DEBUG(dbgs() << "Matched v_swap_b32:\n" << MovT << *MovX << *MovY);

    for (unsigned I = 0; I < Size; ++I) {
      TargetInstrInfo::RegSubRegPair X1, Y1;
      X1 = getSubRegForIndex(X, Xsub, I);
      Y1 = getSubRegForIndex(Y, Ysub, I);
      MachineBasicBlock &MBB = *MovT.getParent();
      auto MIB = BuildMI(MBB, MovX->getIterator(), MovT.getDebugLoc(),
                         TII->get(AMDGPU::V_SWAP_B32))
                     .addDef(X1.Reg, 0, X1.SubReg)
                     .addDef(Y1.Reg, 0, Y1.SubReg)
                     .addReg(Y1.Reg, 0, Y1.SubReg)
                     .addReg(X1.Reg, 0, X1.SubReg).getInstr();
      if (MovX->hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
        // Drop implicit EXEC.
        MIB->removeOperand(MIB->getNumExplicitOperands());
        MIB->copyImplicitOps(*MBB.getParent(), *MovX);
      }
    }
    MovX->eraseFromParent();
    dropInstructionKeepingImpDefs(*MovY);
    MachineInstr *Next = &*std::next(MovT.getIterator());

    if (T.isVirtual() && MRI->use_nodbg_empty(T)) {
      dropInstructionKeepingImpDefs(MovT);
    } else {
      Xop.setIsKill(false);
      for (int I = MovT.getNumImplicitOperands() - 1; I >= 0; --I) {
        unsigned OpNo = MovT.getNumExplicitOperands() + I;
        const MachineOperand &Op = MovT.getOperand(OpNo);
        if (Op.isKill() && TRI->regsOverlap(X, Op.getReg()))
          MovT.removeOperand(OpNo);
      }
    }

    return Next;
  }

  return nullptr;
}

// If an instruction has a dead sdst, replace it with the NULL register on
// gfx1030+.
bool SIShrinkInstructions::tryReplaceDeadSDST(MachineInstr &MI) const {
  if (!ST->hasGFX10_3Insts())
    return false;

  MachineOperand *Op = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
  if (!Op)
    return false;
  Register SDstReg = Op->getReg();
  if (SDstReg.isPhysical() || !MRI->use_nodbg_empty(SDstReg))
    return false;

  Op->setReg(ST->isWave32() ? AMDGPU::SGPR_NULL : AMDGPU::SGPR_NULL64);
  return true;
}

bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  this->MF = &MF;
  MRI = &MF.getRegInfo();
  ST = &MF.getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  unsigned VCCReg = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;

  std::vector<unsigned> I1Defs;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      if (MI.getOpcode() == AMDGPU::V_MOV_B32_e32) {
        // If this has a literal constant source that is the same as the
        // reversed bits of an inline immediate, replace with a bitreverse of
        // that constant. This saves 4 bytes in the common case of
        // materializing sign bits.

        // Test if we are after regalloc. We only want to do this after any
        // optimizations happen because this will confuse them.
        // XXX - not exactly a check for post-regalloc run.
        MachineOperand &Src = MI.getOperand(1);
        if (Src.isImm() && MI.getOperand(0).getReg().isPhysical()) {
          int32_t ReverseImm;
          if (isReverseInlineImm(Src, ReverseImm)) {
            MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32));
            Src.setImm(ReverseImm);
            continue;
          }
        }
      }

      if (ST->hasSwap() && (MI.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
                            MI.getOpcode() == AMDGPU::COPY)) {
        if (auto *NextMI = matchSwap(MI)) {
          Next = NextMI->getIterator();
          continue;
        }
      }

      // Try to use S_ADDK_I32 and S_MULK_I32.
      if (MI.getOpcode() == AMDGPU::S_ADD_I32 ||
          MI.getOpcode() == AMDGPU::S_MUL_I32) {
        const MachineOperand *Dest = &MI.getOperand(0);
        MachineOperand *Src0 = &MI.getOperand(1);
        MachineOperand *Src1 = &MI.getOperand(2);

        if (!Src0->isReg() && Src1->isReg()) {
          if (TII->commuteInstruction(MI, false, 1, 2))
            std::swap(Src0, Src1);
        }

        // FIXME: This could work better if hints worked with subregisters. If
        // we have a vector add of a constant, we usually don't get the correct
        // allocation due to the subregister usage.
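        // The goal (illustrative) is e.g. s_add_i32 s0, s0, 0x4321 ->
        // s_addk_i32 s0, 0x4321, which requires dst and src0 to end up in the
        // same register, so hint the allocator in that direction.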
        if (Dest->getReg().isVirtual() && Src0->isReg()) {
          MRI->setRegAllocationHint(Dest->getReg(), 0, Src0->getReg());
          MRI->setRegAllocationHint(Src0->getReg(), 0, Dest->getReg());
          continue;
        }

        if (Src0->isReg() && Src0->getReg() == Dest->getReg()) {
          if (Src1->isImm() && isKImmOperand(*Src1)) {
            unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ?
              AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32;

            MI.setDesc(TII->get(Opc));
            MI.tieOperands(0, 1);
          }
        }
      }

      // Try to use s_cmpk_*
      if (MI.isCompare() && TII->isSOPC(MI)) {
        shrinkScalarCompare(MI);
        continue;
      }

      // Try to use S_MOVK_I32, which will save 4 bytes for small immediates.
      if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
        const MachineOperand &Dst = MI.getOperand(0);
        MachineOperand &Src = MI.getOperand(1);

        if (Src.isImm() && Dst.getReg().isPhysical()) {
          int32_t ReverseImm;
          if (isKImmOperand(Src))
            MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
          else if (isReverseInlineImm(Src, ReverseImm)) {
            MI.setDesc(TII->get(AMDGPU::S_BREV_B32));
            Src.setImm(ReverseImm);
          }
        }

        continue;
      }

      // Shrink scalar logic operations.
      if (MI.getOpcode() == AMDGPU::S_AND_B32 ||
          MI.getOpcode() == AMDGPU::S_OR_B32 ||
          MI.getOpcode() == AMDGPU::S_XOR_B32) {
        if (shrinkScalarLogicOp(MI))
          continue;
      }

      if (TII->isMIMG(MI.getOpcode()) &&
          ST->getGeneration() >= AMDGPUSubtarget::GFX10 &&
          MF.getProperties().hasProperty(
              MachineFunctionProperties::Property::NoVRegs)) {
        shrinkMIMG(MI);
        continue;
      }

      if (!TII->isVOP3(MI))
        continue;

      if (MI.getOpcode() == AMDGPU::V_MAD_F32_e64 ||
          MI.getOpcode() == AMDGPU::V_FMA_F32_e64 ||
          MI.getOpcode() == AMDGPU::V_MAD_F16_e64 ||
          MI.getOpcode() == AMDGPU::V_FMA_F16_e64 ||
          MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_e64) {
        shrinkMadFma(MI);
        continue;
      }

      if (!TII->hasVALU32BitEncoding(MI.getOpcode())) {
        // If there is no chance we will shrink it and use VCC as sdst to get
        // a 32 bit form try to replace dead sdst with NULL.
        tryReplaceDeadSDST(MI);
        continue;
      }

      if (!TII->canShrink(MI, *MRI)) {
        // Try commuting the instruction and see if that enables us to shrink
        // it.
        if (!MI.isCommutable() || !TII->commuteInstruction(MI) ||
            !TII->canShrink(MI, *MRI)) {
          tryReplaceDeadSDST(MI);
          continue;
        }
      }

      int Op32 = AMDGPU::getVOPe32(MI.getOpcode());

      if (TII->isVOPC(Op32)) {
        MachineOperand &Op0 = MI.getOperand(0);
        if (Op0.isReg()) {
          // Exclude VOPCX instructions as these don't explicitly write a
          // dst.
          Register DstReg = Op0.getReg();
          if (DstReg.isVirtual()) {
            // VOPC instructions can only write to the VCC register. We can't
            // force them to use VCC here, because this is only one register
            // and cannot deal with sequences which would require multiple
            // copies of VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
            //
            // So, instead of forcing the instruction to write to VCC, we
            // provide a hint to the register allocator to use VCC and then we
            // will run this pass again after RA and shrink it if it outputs to
            // VCC.
            MRI->setRegAllocationHint(DstReg, 0, VCCReg);
            continue;
          }
          if (DstReg != VCCReg)
            continue;
        }
      }

      if (Op32 == AMDGPU::V_CNDMASK_B32_e32) {
        // We shrink V_CNDMASK_B32_e64 using regalloc hints like we do for VOPC
        // instructions.
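        // For example (illustrative), v_cndmask_b32_e64 v0, v1, v2, s[4:5] can
        // only become v_cndmask_b32_e32 v0, v1, v2, vcc once the condition is
        // allocated to VCC, so hint src2 towards VCC here.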
        const MachineOperand *Src2 =
            TII->getNamedOperand(MI, AMDGPU::OpName::src2);
        if (!Src2->isReg())
          continue;
        Register SReg = Src2->getReg();
        if (SReg.isVirtual()) {
          MRI->setRegAllocationHint(SReg, 0, VCCReg);
          continue;
        }
        if (SReg != VCCReg)
          continue;
      }

      // Check for the bool flag output for instructions like V_ADD_I32_e64.
      const MachineOperand *SDst = TII->getNamedOperand(MI,
                                                        AMDGPU::OpName::sdst);

      if (SDst) {
        bool Next = false;

        if (SDst->getReg() != VCCReg) {
          if (SDst->getReg().isVirtual())
            MRI->setRegAllocationHint(SDst->getReg(), 0, VCCReg);
          Next = true;
        }

        // All of the instructions with carry outs also have an SGPR input in
        // src2.
        const MachineOperand *Src2 = TII->getNamedOperand(MI,
                                                          AMDGPU::OpName::src2);
        if (Src2 && Src2->getReg() != VCCReg) {
          if (Src2->getReg().isVirtual())
            MRI->setRegAllocationHint(Src2->getReg(), 0, VCCReg);
          Next = true;
        }

        if (Next)
          continue;
      }

      // Pre-GFX10, shrinking VOP3 instructions pre-RA gave us the chance to
      // fold an immediate into the shrunk instruction as a literal operand. In
      // GFX10 VOP3 instructions can take a literal operand anyway, so there is
      // no advantage to doing this.
      if (ST->hasVOP3Literal() &&
          !MF.getProperties().hasProperty(
              MachineFunctionProperties::Property::NoVRegs))
        continue;

      if (ST->hasTrue16BitInsts() && AMDGPU::isTrue16Inst(MI.getOpcode()) &&
          !shouldShrinkTrue16(MI))
        continue;

      // We can shrink this instruction
      LLVM_DEBUG(dbgs() << "Shrinking " << MI);

      MachineInstr *Inst32 = TII->buildShrunkInst(MI, Op32);
      ++NumInstructionsShrunk;

      // Copy extra operands not present in the instruction definition.
      copyExtraImplicitOps(*Inst32, MI);

      // Copy deadness from the old explicit vcc def to the new implicit def.
      if (SDst && SDst->isDead())
        Inst32->findRegisterDefOperand(VCCReg)->setIsDead();

      MI.eraseFromParent();
      foldImmediates(*Inst32);

      LLVM_DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n');
    }
  }
  return false;
}