//===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
/// The pass tries to use the 32-bit encoding for instructions when possible.
//===----------------------------------------------------------------------===//
//

#include "SIShrinkInstructions.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"

#define DEBUG_TYPE "si-shrink-instructions"

STATISTIC(NumInstructionsShrunk,
          "Number of 64-bit instructions reduced to 32-bit.");
STATISTIC(NumLiteralConstantsFolded,
          "Number of literal constants folded into 32-bit instructions.");

using namespace llvm;

namespace {

class SIShrinkInstructions {
  MachineFunction *MF;
  MachineRegisterInfo *MRI;
  const GCNSubtarget *ST;
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  bool IsPostRA;

  bool foldImmediates(MachineInstr &MI, bool TryToCommute = true) const;
  bool shouldShrinkTrue16(MachineInstr &MI) const;
  bool isKImmOperand(const MachineOperand &Src) const;
  bool isKUImmOperand(const MachineOperand &Src) const;
  bool isKImmOrKUImmOperand(const MachineOperand &Src, bool &IsUnsigned) const;
  void copyExtraImplicitOps(MachineInstr &NewMI, MachineInstr &MI) const;
  void shrinkScalarCompare(MachineInstr &MI) const;
  void shrinkMIMG(MachineInstr &MI) const;
  void shrinkMadFma(MachineInstr &MI) const;
  bool shrinkScalarLogicOp(MachineInstr &MI) const;
  bool tryReplaceDeadSDST(MachineInstr &MI) const;
  bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R,
                     Register Reg, unsigned SubReg) const;
  bool instReadsReg(const MachineInstr *MI, unsigned Reg,
                    unsigned SubReg) const;
  bool instModifiesReg(const MachineInstr *MI, unsigned Reg,
                       unsigned SubReg) const;
  TargetInstrInfo::RegSubRegPair getSubRegForIndex(Register Reg, unsigned Sub,
                                                   unsigned I) const;
  void dropInstructionKeepingImpDefs(MachineInstr &MI) const;
  MachineInstr *matchSwap(MachineInstr &MovT) const;

public:
  SIShrinkInstructions() = default;
  bool run(MachineFunction &MF);
};

class SIShrinkInstructionsLegacy : public MachineFunctionPass {

public:
  static char ID;

  SIShrinkInstructionsLegacy() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Shrink Instructions"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS(SIShrinkInstructionsLegacy, DEBUG_TYPE,
                "SI Shrink Instructions", false, false)

char SIShrinkInstructionsLegacy::ID = 0;

FunctionPass *llvm::createSIShrinkInstructionsLegacyPass() {
  return new SIShrinkInstructionsLegacy();
}

/// This function checks \p MI for operands defined by a move immediate
/// instruction and then folds the literal constant into the instruction if it
/// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instruction.
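/// For example, if the only definition of src0 is "v_mov_b32 %v, 0x41200000",
/// the literal can be rewritten directly into the user's src0 operand and the
/// v_mov_b32 erased once it has no remaining uses.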
bool SIShrinkInstructions::foldImmediates(MachineInstr &MI,
                                          bool TryToCommute) const {
  assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI));

  int Src0Idx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);

  // Try to fold Src0
  MachineOperand &Src0 = MI.getOperand(Src0Idx);
  if (Src0.isReg()) {
    Register Reg = Src0.getReg();
    if (Reg.isVirtual()) {
      MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
      if (Def && Def->isMoveImmediate()) {
        MachineOperand &MovSrc = Def->getOperand(1);
        bool ConstantFolded = false;

        if (TII->isOperandLegal(MI, Src0Idx, &MovSrc)) {
          if (MovSrc.isImm()) {
            Src0.ChangeToImmediate(MovSrc.getImm());
            ConstantFolded = true;
          } else if (MovSrc.isFI()) {
            Src0.ChangeToFrameIndex(MovSrc.getIndex());
            ConstantFolded = true;
          } else if (MovSrc.isGlobal()) {
            Src0.ChangeToGA(MovSrc.getGlobal(), MovSrc.getOffset(),
                            MovSrc.getTargetFlags());
            ConstantFolded = true;
          }
        }

        if (ConstantFolded) {
          if (MRI->use_nodbg_empty(Reg))
            Def->eraseFromParent();
          ++NumLiteralConstantsFolded;
          return true;
        }
      }
    }
  }

  // We have failed to fold src0, so commute the instruction and try again.
  if (TryToCommute && MI.isCommutable()) {
    if (TII->commuteInstruction(MI)) {
      if (foldImmediates(MI, false))
        return true;

      // Commute back.
      TII->commuteInstruction(MI);
    }
  }

  return false;
}

/// Do not shrink the instruction if its registers are not expressible in the
/// shrunk encoding.
bool SIShrinkInstructions::shouldShrinkTrue16(MachineInstr &MI) const {
  for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
    const MachineOperand &MO = MI.getOperand(I);
    if (MO.isReg()) {
      Register Reg = MO.getReg();
      assert(!Reg.isVirtual() && "Prior checks should ensure we only shrink "
                                 "True16 Instructions post-RA");
      if (AMDGPU::VGPR_32RegClass.contains(Reg) &&
          !AMDGPU::VGPR_32_Lo128RegClass.contains(Reg))
        return false;

      if (AMDGPU::VGPR_16RegClass.contains(Reg) &&
          !AMDGPU::VGPR_16_Lo128RegClass.contains(Reg))
        return false;
    }
  }
  return true;
}

bool SIShrinkInstructions::isKImmOperand(const MachineOperand &Src) const {
  return isInt<16>(SignExtend64(Src.getImm(), 32)) &&
         !TII->isInlineConstant(*Src.getParent(), Src.getOperandNo());
}

bool SIShrinkInstructions::isKUImmOperand(const MachineOperand &Src) const {
  return isUInt<16>(Src.getImm()) &&
         !TII->isInlineConstant(*Src.getParent(), Src.getOperandNo());
}

bool SIShrinkInstructions::isKImmOrKUImmOperand(const MachineOperand &Src,
                                                bool &IsUnsigned) const {
  if (isInt<16>(SignExtend64(Src.getImm(), 32))) {
    IsUnsigned = false;
    return !TII->isInlineConstant(Src);
  }

  if (isUInt<16>(Src.getImm())) {
    IsUnsigned = true;
    return !TII->isInlineConstant(Src);
  }

  return false;
}

/// \returns the opcode of an instruction a move immediate of the constant \p
/// Src can be replaced with if the constant is replaced with \p ModifiedImm.
/// i.e.
///
/// If the bitreverse of a constant is an inline immediate, reverse the
/// immediate and return the bitreverse opcode.
///
/// If the bitwise negation of a constant is an inline immediate, invert the
/// immediate and return the bitwise not opcode.
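///
/// For example, 0x80000000 is not an inline immediate, but its bit reverse is
/// 1, so "v_mov_b32 v0, 0x80000000" can be re-encoded as "v_bfrev_b32 v0, 1"
/// without a literal dword.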
static unsigned canModifyToInlineImmOp32(const SIInstrInfo *TII,
                                         const MachineOperand &Src,
                                         int32_t &ModifiedImm, bool Scalar) {
  if (TII->isInlineConstant(Src))
    return 0;
  int32_t SrcImm = static_cast<int32_t>(Src.getImm());

  if (!Scalar) {
    // We could handle the scalar case here as well, but we would need to
    // check that SCC is not live as S_NOT_B32 clobbers it. It's probably not
    // worth it, as the reasonable values are already covered by s_movk_i32.
    ModifiedImm = ~SrcImm;
    if (TII->isInlineConstant(APInt(32, ModifiedImm, true)))
      return AMDGPU::V_NOT_B32_e32;
  }

  ModifiedImm = reverseBits<int32_t>(SrcImm);
  if (TII->isInlineConstant(APInt(32, ModifiedImm, true)))
    return Scalar ? AMDGPU::S_BREV_B32 : AMDGPU::V_BFREV_B32_e32;

  return 0;
}

/// Copy implicit register operands from specified instruction to this
/// instruction that are not part of the instruction definition.
void SIShrinkInstructions::copyExtraImplicitOps(MachineInstr &NewMI,
                                                MachineInstr &MI) const {
  MachineFunction &MF = *MI.getMF();
  for (unsigned i = MI.getDesc().getNumOperands() +
                    MI.getDesc().implicit_uses().size() +
                    MI.getDesc().implicit_defs().size(),
                e = MI.getNumOperands();
       i != e; ++i) {
    const MachineOperand &MO = MI.getOperand(i);
    if ((MO.isReg() && MO.isImplicit()) || MO.isRegMask())
      NewMI.addOperand(MF, MO);
  }
}

void SIShrinkInstructions::shrinkScalarCompare(MachineInstr &MI) const {
  if (!ST->hasSCmpK())
    return;

  // cmpk instructions do scc = dst <cc op> imm16, so commute the instruction
  // to get constants on the RHS.
  if (!MI.getOperand(0).isReg())
    TII->commuteInstruction(MI, false, 0, 1);

  // cmpk requires src0 to be a register
  const MachineOperand &Src0 = MI.getOperand(0);
  if (!Src0.isReg())
    return;

  MachineOperand &Src1 = MI.getOperand(1);
  if (!Src1.isImm())
    return;

  int SOPKOpc = AMDGPU::getSOPKOp(MI.getOpcode());
  if (SOPKOpc == -1)
    return;

  // eq/ne is special because the imm16 can be treated as signed or unsigned,
  // and initially selected to the unsigned versions.
  if (SOPKOpc == AMDGPU::S_CMPK_EQ_U32 || SOPKOpc == AMDGPU::S_CMPK_LG_U32) {
    bool HasUImm;
    if (isKImmOrKUImmOperand(Src1, HasUImm)) {
      if (!HasUImm) {
        SOPKOpc = (SOPKOpc == AMDGPU::S_CMPK_EQ_U32) ?
                      AMDGPU::S_CMPK_EQ_I32 : AMDGPU::S_CMPK_LG_I32;
        Src1.setImm(SignExtend32(Src1.getImm(), 32));
      }

      MI.setDesc(TII->get(SOPKOpc));
    }

    return;
  }

  const MCInstrDesc &NewDesc = TII->get(SOPKOpc);

  if ((SIInstrInfo::sopkIsZext(SOPKOpc) && isKUImmOperand(Src1)) ||
      (!SIInstrInfo::sopkIsZext(SOPKOpc) && isKImmOperand(Src1))) {
    if (!SIInstrInfo::sopkIsZext(SOPKOpc))
      Src1.setImm(SignExtend64(Src1.getImm(), 32));
    MI.setDesc(NewDesc);
  }
}

// Shrink NSA encoded instructions with contiguous VGPRs to non-NSA encoding.
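// The NSA (non-sequential address) form spends extra encoding dwords to name
// each address VGPR individually; when the address registers already form a
// contiguous range starting at the base VGPR, the sequential encoding is
// shorter and equivalent.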
void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) const {
  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
  if (!Info)
    return;

  uint8_t NewEncoding;
  switch (Info->MIMGEncoding) {
  case AMDGPU::MIMGEncGfx10NSA:
    NewEncoding = AMDGPU::MIMGEncGfx10Default;
    break;
  case AMDGPU::MIMGEncGfx11NSA:
    NewEncoding = AMDGPU::MIMGEncGfx11Default;
    break;
  default:
    return;
  }

  int VAddr0Idx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
  unsigned NewAddrDwords = Info->VAddrDwords;
  const TargetRegisterClass *RC;

  if (Info->VAddrDwords == 2) {
    RC = &AMDGPU::VReg_64RegClass;
  } else if (Info->VAddrDwords == 3) {
    RC = &AMDGPU::VReg_96RegClass;
  } else if (Info->VAddrDwords == 4) {
    RC = &AMDGPU::VReg_128RegClass;
  } else if (Info->VAddrDwords == 5) {
    RC = &AMDGPU::VReg_160RegClass;
  } else if (Info->VAddrDwords == 6) {
    RC = &AMDGPU::VReg_192RegClass;
  } else if (Info->VAddrDwords == 7) {
    RC = &AMDGPU::VReg_224RegClass;
  } else if (Info->VAddrDwords == 8) {
    RC = &AMDGPU::VReg_256RegClass;
  } else if (Info->VAddrDwords == 9) {
    RC = &AMDGPU::VReg_288RegClass;
  } else if (Info->VAddrDwords == 10) {
    RC = &AMDGPU::VReg_320RegClass;
  } else if (Info->VAddrDwords == 11) {
    RC = &AMDGPU::VReg_352RegClass;
  } else if (Info->VAddrDwords == 12) {
    RC = &AMDGPU::VReg_384RegClass;
  } else {
    RC = &AMDGPU::VReg_512RegClass;
    NewAddrDwords = 16;
  }

  unsigned VgprBase = 0;
  unsigned NextVgpr = 0;
  bool IsUndef = true;
  bool IsKill = NewAddrDwords == Info->VAddrDwords;
  const unsigned NSAMaxSize = ST->getNSAMaxSize();
  const bool IsPartialNSA = NewAddrDwords > NSAMaxSize;
  const unsigned EndVAddr = IsPartialNSA ? NSAMaxSize : Info->VAddrOperands;
  for (unsigned Idx = 0; Idx < EndVAddr; ++Idx) {
    const MachineOperand &Op = MI.getOperand(VAddr0Idx + Idx);
    unsigned Vgpr = TRI->getHWRegIndex(Op.getReg());
    unsigned Dwords = TRI->getRegSizeInBits(Op.getReg(), *MRI) / 32;
    assert(Dwords > 0 && "Un-implemented for less than 32 bit regs");

    if (Idx == 0) {
      VgprBase = Vgpr;
      NextVgpr = Vgpr + Dwords;
    } else if (Vgpr == NextVgpr) {
      NextVgpr = Vgpr + Dwords;
    } else {
      return;
    }

    if (!Op.isUndef())
      IsUndef = false;
    if (!Op.isKill())
      IsKill = false;
  }

  if (VgprBase + NewAddrDwords > 256)
    return;

  // Further check for implicit tied operands - this may be present if TFE is
  // enabled
  int TFEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::tfe);
  int LWEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::lwe);
  unsigned TFEVal = (TFEIdx == -1) ? 0 : MI.getOperand(TFEIdx).getImm();
  unsigned LWEVal = (LWEIdx == -1) ?
                        0 : MI.getOperand(LWEIdx).getImm();
  int ToUntie = -1;
  if (TFEVal || LWEVal) {
    // TFE/LWE is enabled so we need to deal with an implicit tied operand
    for (unsigned i = LWEIdx + 1, e = MI.getNumOperands(); i != e; ++i) {
      if (MI.getOperand(i).isReg() && MI.getOperand(i).isTied() &&
          MI.getOperand(i).isImplicit()) {
        // This is the tied operand
        assert(
            ToUntie == -1 &&
            "found more than one tied implicit operand when expecting only 1");
        ToUntie = i;
        MI.untieRegOperand(ToUntie);
      }
    }
  }

  unsigned NewOpcode = AMDGPU::getMIMGOpcode(Info->BaseOpcode, NewEncoding,
                                             Info->VDataDwords, NewAddrDwords);
  MI.setDesc(TII->get(NewOpcode));
  MI.getOperand(VAddr0Idx).setReg(RC->getRegister(VgprBase));
  MI.getOperand(VAddr0Idx).setIsUndef(IsUndef);
  MI.getOperand(VAddr0Idx).setIsKill(IsKill);

  for (unsigned i = 1; i < EndVAddr; ++i)
    MI.removeOperand(VAddr0Idx + 1);

  if (ToUntie >= 0) {
    MI.tieOperands(
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata),
        ToUntie - (EndVAddr - 1));
  }
}

// Shrink MAD to MADAK/MADMK and FMA to FMAAK/FMAMK.
void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
  // Pre-GFX10 VOP3 instructions like MAD/FMA cannot take a literal operand so
  // there is no reason to try to shrink them.
  if (!ST->hasVOP3Literal())
    return;

  // There is no advantage to doing this pre-RA.
  if (!IsPostRA)
    return;

  if (TII->hasAnyModifiersSet(MI))
    return;

  const unsigned Opcode = MI.getOpcode();
  MachineOperand &Src0 = *TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand &Src1 = *TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  MachineOperand &Src2 = *TII->getNamedOperand(MI, AMDGPU::OpName::src2);
  unsigned NewOpcode = AMDGPU::INSTRUCTION_LIST_END;

  bool Swap;

  // Detect "Dst = VSrc * VGPR + Imm" and convert to AK form.
  if (Src2.isImm() && !TII->isInlineConstant(Src2)) {
    if (Src1.isReg() && TRI->isVGPR(*MRI, Src1.getReg()))
      Swap = false;
    else if (Src0.isReg() && TRI->isVGPR(*MRI, Src0.getReg()))
      Swap = true;
    else
      return;

    switch (Opcode) {
    default:
      llvm_unreachable("Unexpected mad/fma opcode!");
    case AMDGPU::V_MAD_F32_e64:
      NewOpcode = AMDGPU::V_MADAK_F32;
      break;
    case AMDGPU::V_FMA_F32_e64:
      NewOpcode = AMDGPU::V_FMAAK_F32;
      break;
    case AMDGPU::V_MAD_F16_e64:
      NewOpcode = AMDGPU::V_MADAK_F16;
      break;
    case AMDGPU::V_FMA_F16_e64:
    case AMDGPU::V_FMA_F16_gfx9_e64:
      NewOpcode = AMDGPU::V_FMAAK_F16;
      break;
    case AMDGPU::V_FMA_F16_gfx9_t16_e64:
      NewOpcode = AMDGPU::V_FMAAK_F16_t16;
      break;
    case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
      NewOpcode = AMDGPU::V_FMAAK_F16_fake16;
      break;
    case AMDGPU::V_FMA_F64_e64:
      if (ST->hasFmaakFmamkF64Insts())
        NewOpcode = AMDGPU::V_FMAAK_F64;
      break;
    }
  }

  // Detect "Dst = VSrc * Imm + VGPR" and convert to MK form.
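  // e.g. a VOP3 "v_fma_f32 vDst, vSrc, <literal>, vAcc" becomes the VOP2
  // "v_fmamk_f32 vDst, vSrc, <literal>, vAcc", which always carries the
  // literal and saves the extra VOP3 encoding dword.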
  if (Src2.isReg() && TRI->isVGPR(*MRI, Src2.getReg())) {
    if (Src1.isImm() && !TII->isInlineConstant(Src1))
      Swap = false;
    else if (Src0.isImm() && !TII->isInlineConstant(Src0))
      Swap = true;
    else
      return;

    switch (Opcode) {
    default:
      llvm_unreachable("Unexpected mad/fma opcode!");
    case AMDGPU::V_MAD_F32_e64:
      NewOpcode = AMDGPU::V_MADMK_F32;
      break;
    case AMDGPU::V_FMA_F32_e64:
      NewOpcode = AMDGPU::V_FMAMK_F32;
      break;
    case AMDGPU::V_MAD_F16_e64:
      NewOpcode = AMDGPU::V_MADMK_F16;
      break;
    case AMDGPU::V_FMA_F16_e64:
    case AMDGPU::V_FMA_F16_gfx9_e64:
      NewOpcode = AMDGPU::V_FMAMK_F16;
      break;
    case AMDGPU::V_FMA_F16_gfx9_t16_e64:
      NewOpcode = AMDGPU::V_FMAMK_F16_t16;
      break;
    case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
      NewOpcode = AMDGPU::V_FMAMK_F16_fake16;
      break;
    case AMDGPU::V_FMA_F64_e64:
      if (ST->hasFmaakFmamkF64Insts())
        NewOpcode = AMDGPU::V_FMAMK_F64;
      break;
    }
  }

  if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END)
    return;

  if (AMDGPU::isTrue16Inst(NewOpcode) && !shouldShrinkTrue16(MI))
    return;

  if (Swap) {
    // Swap Src0 and Src1 by building a new instruction.
    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(NewOpcode),
            MI.getOperand(0).getReg())
        .add(Src1)
        .add(Src0)
        .add(Src2)
        .setMIFlags(MI.getFlags());
    MI.eraseFromParent();
  } else {
    TII->removeModOperands(MI);
    MI.setDesc(TII->get(NewOpcode));
  }
}

/// Attempt to shrink AND/OR/XOR operations requiring non-inlineable literals.
/// For AND or OR, try using S_BITSET{0,1} to clear or set bits.
/// If the inverse of the immediate is legal, use ANDN2, ORN2 or
/// XNOR (as a ^ b == ~(a ^ ~b)).
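/// For example, "s_and_b32 s0, s0, 0xffffffbf" only clears bit 6, so it can
/// become "s_bitset0_b32 s0, 6", and "s_xor_b32 s0, s1, 0xffffffc0" can become
/// "s_xnor_b32 s0, s1, 63" since ~0xffffffc0 == 63 is an inline immediate.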
/// \returns true if the caller should continue the machine function iterator
bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();
  const MachineOperand *Dest = &MI.getOperand(0);
  MachineOperand *Src0 = &MI.getOperand(1);
  MachineOperand *Src1 = &MI.getOperand(2);
  MachineOperand *SrcReg = Src0;
  MachineOperand *SrcImm = Src1;

  if (!SrcImm->isImm() ||
      AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST->hasInv2PiInlineImm()))
    return false;

  uint32_t Imm = static_cast<uint32_t>(SrcImm->getImm());
  uint32_t NewImm = 0;

  if (Opc == AMDGPU::S_AND_B32) {
    if (isPowerOf2_32(~Imm)) {
      NewImm = llvm::countr_one(Imm);
      Opc = AMDGPU::S_BITSET0_B32;
    } else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
      NewImm = ~Imm;
      Opc = AMDGPU::S_ANDN2_B32;
    }
  } else if (Opc == AMDGPU::S_OR_B32) {
    if (isPowerOf2_32(Imm)) {
      NewImm = llvm::countr_zero(Imm);
      Opc = AMDGPU::S_BITSET1_B32;
    } else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
      NewImm = ~Imm;
      Opc = AMDGPU::S_ORN2_B32;
    }
  } else if (Opc == AMDGPU::S_XOR_B32) {
    if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
      NewImm = ~Imm;
      Opc = AMDGPU::S_XNOR_B32;
    }
  } else {
    llvm_unreachable("unexpected opcode");
  }

  if (NewImm != 0) {
    if (Dest->getReg().isVirtual() && SrcReg->isReg()) {
      MRI->setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg());
      MRI->setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg());
      return true;
    }

    if (SrcReg->isReg() && SrcReg->getReg() == Dest->getReg()) {
      const bool IsUndef = SrcReg->isUndef();
      const bool IsKill = SrcReg->isKill();
      MI.setDesc(TII->get(Opc));
      if (Opc == AMDGPU::S_BITSET0_B32 ||
          Opc == AMDGPU::S_BITSET1_B32) {
        Src0->ChangeToImmediate(NewImm);
        // Remove the immediate and add the tied input.
        MI.getOperand(2).ChangeToRegister(Dest->getReg(), /*IsDef*/ false,
                                          /*isImp*/ false, IsKill,
                                          /*isDead*/ false, IsUndef);
        MI.tieOperands(0, 2);
      } else {
        SrcImm->setImm(NewImm);
      }
    }
  }

  return false;
}

// This is the same as MachineInstr::readsRegister/modifiesRegister except
// it takes subregs into account.
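// E.g. an access to sub0 of a 64-bit virtual register does not conflict with
// an access to its sub1, since their lane masks do not overlap.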
bool SIShrinkInstructions::instAccessReg(
    iterator_range<MachineInstr::const_mop_iterator> &&R, Register Reg,
    unsigned SubReg) const {
  for (const MachineOperand &MO : R) {
    if (!MO.isReg())
      continue;

    if (Reg.isPhysical() && MO.getReg().isPhysical()) {
      if (TRI->regsOverlap(Reg, MO.getReg()))
        return true;
    } else if (MO.getReg() == Reg && Reg.isVirtual()) {
      LaneBitmask Overlap = TRI->getSubRegIndexLaneMask(SubReg) &
                            TRI->getSubRegIndexLaneMask(MO.getSubReg());
      if (Overlap.any())
        return true;
    }
  }
  return false;
}

bool SIShrinkInstructions::instReadsReg(const MachineInstr *MI, unsigned Reg,
                                        unsigned SubReg) const {
  return instAccessReg(MI->uses(), Reg, SubReg);
}

bool SIShrinkInstructions::instModifiesReg(const MachineInstr *MI, unsigned Reg,
                                           unsigned SubReg) const {
  return instAccessReg(MI->defs(), Reg, SubReg);
}

TargetInstrInfo::RegSubRegPair
SIShrinkInstructions::getSubRegForIndex(Register Reg, unsigned Sub,
                                        unsigned I) const {
  if (TRI->getRegSizeInBits(Reg, *MRI) != 32) {
    if (Reg.isPhysical()) {
      Reg = TRI->getSubReg(Reg, TRI->getSubRegFromChannel(I));
    } else {
      Sub = TRI->getSubRegFromChannel(I + TRI->getChannelFromSubReg(Sub));
    }
  }
  return TargetInstrInfo::RegSubRegPair(Reg, Sub);
}

void SIShrinkInstructions::dropInstructionKeepingImpDefs(
    MachineInstr &MI) const {
  for (unsigned i = MI.getDesc().getNumOperands() +
                    MI.getDesc().implicit_uses().size() +
                    MI.getDesc().implicit_defs().size(),
                e = MI.getNumOperands();
       i != e; ++i) {
    const MachineOperand &Op = MI.getOperand(i);
    if (!Op.isDef())
      continue;
    BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
            TII->get(AMDGPU::IMPLICIT_DEF), Op.getReg());
  }

  MI.eraseFromParent();
}

// Match:
// mov t, x
// mov x, y
// mov y, t
//
// =>
//
// mov t, x (t is potentially dead and move eliminated)
// v_swap_b32 x, y
//
// Returns a pointer to the next valid instruction if it was able to create
// v_swap_b32.
//
// This should not be done too early, so as not to prevent folding that may
// remove the matched moves; it is preferably done before RA to release saved
// registers, and also possibly after RA, which can insert copies too.
//
// This is really just a generic peephole that is not a canonical shrinking,
// although requirements match the pass placement and it reduces code size too.
MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const {
  assert(MovT.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
         MovT.getOpcode() == AMDGPU::V_MOV_B16_t16_e32 ||
         MovT.getOpcode() == AMDGPU::COPY);

  Register T = MovT.getOperand(0).getReg();
  unsigned Tsub = MovT.getOperand(0).getSubReg();
  MachineOperand &Xop = MovT.getOperand(1);

  if (!Xop.isReg())
    return nullptr;
  Register X = Xop.getReg();
  unsigned Xsub = Xop.getSubReg();

  unsigned Size = TII->getOpSize(MovT, 0);

  // We can't match v_swap_b16 pre-RA, because VGPR_16_Lo128 registers
  // are not allocatable.
  if (Size == 2 && X.isVirtual())
    return nullptr;

  if (!TRI->isVGPR(*MRI, X))
    return nullptr;

  const unsigned SearchLimit = 16;
  unsigned Count = 0;
  bool KilledT = false;
  for (auto Iter = std::next(MovT.getIterator()),
            E = MovT.getParent()->instr_end();
       Iter != E && Count < SearchLimit && !KilledT; ++Iter, ++Count) {

    MachineInstr *MovY = &*Iter;
    KilledT = MovY->killsRegister(T, TRI);

    if ((MovY->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
         MovY->getOpcode() != AMDGPU::V_MOV_B16_t16_e32 &&
         MovY->getOpcode() != AMDGPU::COPY) ||
        !MovY->getOperand(1).isReg() || MovY->getOperand(1).getReg() != T ||
        MovY->getOperand(1).getSubReg() != Tsub)
      continue;

    Register Y = MovY->getOperand(0).getReg();
    unsigned Ysub = MovY->getOperand(0).getSubReg();

    if (!TRI->isVGPR(*MRI, Y))
      continue;

    MachineInstr *MovX = nullptr;
    for (auto IY = MovY->getIterator(), I = std::next(MovT.getIterator());
         I != IY; ++I) {
      if (instReadsReg(&*I, X, Xsub) || instModifiesReg(&*I, Y, Ysub) ||
          instModifiesReg(&*I, T, Tsub) ||
          (MovX && instModifiesReg(&*I, X, Xsub))) {
        MovX = nullptr;
        break;
      }
      if (!instReadsReg(&*I, Y, Ysub)) {
        if (!MovX && instModifiesReg(&*I, X, Xsub)) {
          MovX = nullptr;
          break;
        }
        continue;
      }
      if (MovX ||
          (I->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
           I->getOpcode() != AMDGPU::V_MOV_B16_t16_e32 &&
           I->getOpcode() != AMDGPU::COPY) ||
          I->getOperand(0).getReg() != X ||
          I->getOperand(0).getSubReg() != Xsub) {
        MovX = nullptr;
        break;
      }

      if (Size > 4 && (I->getNumImplicitOperands() > (I->isCopy() ? 0U : 1U)))
        continue;

      MovX = &*I;
    }

    if (!MovX)
      continue;

    LLVM_DEBUG(dbgs() << "Matched v_swap:\n" << MovT << *MovX << *MovY);

    MachineBasicBlock &MBB = *MovT.getParent();
    SmallVector<MachineInstr *, 4> Swaps;
    if (Size == 2) {
      auto *MIB = BuildMI(MBB, MovX->getIterator(), MovT.getDebugLoc(),
                          TII->get(AMDGPU::V_SWAP_B16))
                      .addDef(X)
                      .addDef(Y)
                      .addReg(Y)
                      .addReg(X)
                      .getInstr();
      Swaps.push_back(MIB);
    } else {
      assert(Size > 0 && Size % 4 == 0);
      for (unsigned I = 0; I < Size / 4; ++I) {
        TargetInstrInfo::RegSubRegPair X1, Y1;
        X1 = getSubRegForIndex(X, Xsub, I);
        Y1 = getSubRegForIndex(Y, Ysub, I);
        auto *MIB = BuildMI(MBB, MovX->getIterator(), MovT.getDebugLoc(),
                            TII->get(AMDGPU::V_SWAP_B32))
                        .addDef(X1.Reg, 0, X1.SubReg)
                        .addDef(Y1.Reg, 0, Y1.SubReg)
                        .addReg(Y1.Reg, 0, Y1.SubReg)
                        .addReg(X1.Reg, 0, X1.SubReg)
                        .getInstr();
        Swaps.push_back(MIB);
      }
    }
    // Drop implicit EXEC.
    if (MovX->hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
      for (MachineInstr *Swap : Swaps) {
        Swap->removeOperand(Swap->getNumExplicitOperands());
        Swap->copyImplicitOps(*MBB.getParent(), *MovX);
      }
    }
    MovX->eraseFromParent();
    dropInstructionKeepingImpDefs(*MovY);
    MachineInstr *Next = &*std::next(MovT.getIterator());

    if (T.isVirtual() && MRI->use_nodbg_empty(T)) {
      dropInstructionKeepingImpDefs(MovT);
    } else {
      Xop.setIsKill(false);
      for (int I = MovT.getNumImplicitOperands() - 1; I >= 0; --I) {
        unsigned OpNo = MovT.getNumExplicitOperands() + I;
        const MachineOperand &Op = MovT.getOperand(OpNo);
        if (Op.isKill() && TRI->regsOverlap(X, Op.getReg()))
          MovT.removeOperand(OpNo);
      }
    }

    return Next;
  }

  return nullptr;
}

// If an instruction has a dead sdst, replace it with the NULL register on
// gfx1030+.
bool SIShrinkInstructions::tryReplaceDeadSDST(MachineInstr &MI) const {
  if (!ST->hasGFX10_3Insts())
    return false;

  MachineOperand *Op = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
  if (!Op)
    return false;
  Register SDstReg = Op->getReg();
  if (SDstReg.isPhysical() || !MRI->use_nodbg_empty(SDstReg))
    return false;

  Op->setReg(ST->isWave32() ? AMDGPU::SGPR_NULL : AMDGPU::SGPR_NULL64);
  return true;
}

bool SIShrinkInstructions::run(MachineFunction &MF) {

  this->MF = &MF;
  MRI = &MF.getRegInfo();
  ST = &MF.getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  IsPostRA = MF.getProperties().hasNoVRegs();

  unsigned VCCReg = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      if (MI.getOpcode() == AMDGPU::V_MOV_B32_e32) {
        // If this has a literal constant source that is the same as the
        // reversed bits of an inline immediate, replace with a bitreverse of
        // that constant. This saves 4 bytes in the common case of
        // materializing sign bits.

        // Test if we are after regalloc. We only want to do this after any
        // other optimizations happen, because this would confuse them.
        MachineOperand &Src = MI.getOperand(1);
        if (Src.isImm() && IsPostRA) {
          int32_t ModImm;
          unsigned ModOpcode =
              canModifyToInlineImmOp32(TII, Src, ModImm, /*Scalar=*/false);
          if (ModOpcode != 0) {
            MI.setDesc(TII->get(ModOpcode));
            Src.setImm(static_cast<int64_t>(ModImm));
            continue;
          }
        }
      }

      if (ST->hasSwap() && (MI.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
                            MI.getOpcode() == AMDGPU::V_MOV_B16_t16_e32 ||
                            MI.getOpcode() == AMDGPU::COPY)) {
        if (auto *NextMI = matchSwap(MI)) {
          Next = NextMI->getIterator();
          continue;
        }
      }

      // Try to use S_ADDK_I32 and S_MULK_I32.
      if (MI.getOpcode() == AMDGPU::S_ADD_I32 ||
          MI.getOpcode() == AMDGPU::S_MUL_I32) {
        const MachineOperand *Dest = &MI.getOperand(0);
        MachineOperand *Src0 = &MI.getOperand(1);
        MachineOperand *Src1 = &MI.getOperand(2);

        if (!Src0->isReg() && Src1->isReg()) {
          if (TII->commuteInstruction(MI, false, 1, 2))
            std::swap(Src0, Src1);
        }

        // FIXME: This could work better if hints worked with subregisters. If
        // we have a vector add of a constant, we usually don't get the correct
        // allocation due to the subregister usage.
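        // Prefer allocating Dest on top of Src0 so that the tied
        // "Dest = Dest op imm" K-form below becomes possible after RA.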
        if (Dest->getReg().isVirtual() && Src0->isReg()) {
          MRI->setRegAllocationHint(Dest->getReg(), 0, Src0->getReg());
          MRI->setRegAllocationHint(Src0->getReg(), 0, Dest->getReg());
          continue;
        }

        if (Src0->isReg() && Src0->getReg() == Dest->getReg()) {
          if (Src1->isImm() && isKImmOperand(*Src1)) {
            unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ?
                               AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32;

            Src1->setImm(SignExtend64(Src1->getImm(), 32));
            MI.setDesc(TII->get(Opc));
            MI.tieOperands(0, 1);
          }
        }
      }

      // Try to use s_cmpk_*
      if (MI.isCompare() && TII->isSOPC(MI)) {
        shrinkScalarCompare(MI);
        continue;
      }

      // Try to use S_MOVK_I32, which will save 4 bytes for small immediates.
      if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
        const MachineOperand &Dst = MI.getOperand(0);
        MachineOperand &Src = MI.getOperand(1);

        if (Src.isImm() && Dst.getReg().isPhysical()) {
          unsigned ModOpc;
          int32_t ModImm;
          if (isKImmOperand(Src)) {
            MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
            Src.setImm(SignExtend64(Src.getImm(), 32));
          } else if ((ModOpc = canModifyToInlineImmOp32(TII, Src, ModImm,
                                                        /*Scalar=*/true))) {
            MI.setDesc(TII->get(ModOpc));
            Src.setImm(static_cast<int64_t>(ModImm));
          }
        }

        continue;
      }

      // Shrink scalar logic operations.
      if (MI.getOpcode() == AMDGPU::S_AND_B32 ||
          MI.getOpcode() == AMDGPU::S_OR_B32 ||
          MI.getOpcode() == AMDGPU::S_XOR_B32) {
        if (shrinkScalarLogicOp(MI))
          continue;
      }

      if (IsPostRA && TII->isMIMG(MI.getOpcode()) &&
          ST->getGeneration() >= AMDGPUSubtarget::GFX10) {
        shrinkMIMG(MI);
        continue;
      }

      if (!TII->isVOP3(MI))
        continue;

      if (MI.getOpcode() == AMDGPU::V_MAD_F32_e64 ||
          MI.getOpcode() == AMDGPU::V_FMA_F32_e64 ||
          MI.getOpcode() == AMDGPU::V_MAD_F16_e64 ||
          MI.getOpcode() == AMDGPU::V_FMA_F16_e64 ||
          MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_e64 ||
          MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_t16_e64 ||
          MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_fake16_e64 ||
          (MI.getOpcode() == AMDGPU::V_FMA_F64_e64 &&
           ST->hasFmaakFmamkF64Insts())) {
        shrinkMadFma(MI);
        continue;
      }

      // If there is no chance we will shrink it and use VCC as sdst to get
      // a 32 bit form, try to replace the dead sdst with NULL.
      if (TII->isVOP3(MI.getOpcode())) {
        tryReplaceDeadSDST(MI);
        if (!TII->hasVALU32BitEncoding(MI.getOpcode())) {
          continue;
        }
      }

      if (!TII->canShrink(MI, *MRI)) {
        // Try commuting the instruction and see if that enables us to shrink
        // it.
        if (!MI.isCommutable() || !TII->commuteInstruction(MI) ||
            !TII->canShrink(MI, *MRI)) {
          tryReplaceDeadSDST(MI);
          continue;
        }
      }

      int Op32 = AMDGPU::getVOPe32(MI.getOpcode());

      if (TII->isVOPC(Op32)) {
        MachineOperand &Op0 = MI.getOperand(0);
        if (Op0.isReg()) {
          // Exclude VOPCX instructions as these don't explicitly write a
          // dst.
          Register DstReg = Op0.getReg();
          if (DstReg.isVirtual()) {
            // VOPC instructions can only write to the VCC register. We can't
            // force them to use VCC here, because this is only one register
            // and cannot deal with sequences which would require multiple
            // copies of VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
            //
            // So, instead of forcing the instruction to write to VCC, we
            // provide a hint to the register allocator to use VCC and then we
            // will run this pass again after RA and shrink it if it outputs to
            // VCC.
            MRI->setRegAllocationHint(DstReg, 0, VCCReg);
            continue;
          }
          if (DstReg != VCCReg)
            continue;
        }
      }

      if (Op32 == AMDGPU::V_CNDMASK_B32_e32) {
        // We shrink V_CNDMASK_B32_e64 using regalloc hints like we do for VOPC
        // instructions.
        const MachineOperand *Src2 =
            TII->getNamedOperand(MI, AMDGPU::OpName::src2);
        if (!Src2->isReg())
          continue;
        Register SReg = Src2->getReg();
        if (SReg.isVirtual()) {
          MRI->setRegAllocationHint(SReg, 0, VCCReg);
          continue;
        }
        if (SReg != VCCReg)
          continue;
      }

      // Check for the bool flag output for instructions like V_ADD_I32_e64.
      const MachineOperand *SDst =
          TII->getNamedOperand(MI, AMDGPU::OpName::sdst);

      if (SDst) {
        bool Next = false;

        if (SDst->getReg() != VCCReg) {
          if (SDst->getReg().isVirtual())
            MRI->setRegAllocationHint(SDst->getReg(), 0, VCCReg);
          Next = true;
        }

        // All of the instructions with carry outs also have an SGPR input in
        // src2.
        const MachineOperand *Src2 =
            TII->getNamedOperand(MI, AMDGPU::OpName::src2);
        if (Src2 && Src2->getReg() != VCCReg) {
          if (Src2->getReg().isVirtual())
            MRI->setRegAllocationHint(Src2->getReg(), 0, VCCReg);
          Next = true;
        }

        if (Next)
          continue;
      }

      // Pre-GFX10, shrinking VOP3 instructions pre-RA gave us the chance to
      // fold an immediate into the shrunk instruction as a literal operand. In
      // GFX10 VOP3 instructions can take a literal operand anyway, so there is
      // no advantage to doing this.
      // However, if 64-bit literals are allowed we still need to shrink it
      // for such a literal to be able to fold.
      if (ST->hasVOP3Literal() &&
          (!ST->has64BitLiterals() || AMDGPU::isTrue16Inst(MI.getOpcode())) &&
          !IsPostRA)
        continue;

      if (ST->hasTrue16BitInsts() && AMDGPU::isTrue16Inst(MI.getOpcode()) &&
          !shouldShrinkTrue16(MI))
        continue;

      // We can shrink this instruction
      LLVM_DEBUG(dbgs() << "Shrinking " << MI);

      MachineInstr *Inst32 = TII->buildShrunkInst(MI, Op32);
      ++NumInstructionsShrunk;

      // Copy extra operands not present in the instruction definition.
      copyExtraImplicitOps(*Inst32, MI);

      // Copy deadness from the old explicit vcc def to the new implicit def.
      if (SDst && SDst->isDead())
        Inst32->findRegisterDefOperand(VCCReg, /*TRI=*/nullptr)->setIsDead();

      MI.eraseFromParent();
      foldImmediates(*Inst32);

      LLVM_DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n');
    }
  }
  return false;
}

bool SIShrinkInstructionsLegacy::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  return SIShrinkInstructions().run(MF);
}

PreservedAnalyses
SIShrinkInstructionsPass::run(MachineFunction &MF,
                              MachineFunctionAnalysisManager &) {
  if (MF.getFunction().hasOptNone() || !SIShrinkInstructions().run(MF))
    return PreservedAnalyses::all();

  auto PA = getMachineFunctionPassPreservedAnalyses();
  PA.preserveSet<CFGAnalyses>();
  return PA;
}