1 //===- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions ---===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file This pass tries to apply several peephole SDWA patterns.
10 ///
11 /// E.g. original:
12 /// V_LSHRREV_B32_e32 %0, 16, %1
13 /// V_ADD_CO_U32_e32 %2, %0, %3
14 /// V_LSHLREV_B32_e32 %4, 16, %2
15 ///
16 /// Replace:
17 /// V_ADD_CO_U32_sdwa %4, %1, %3
18 /// dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
19 ///
20 //===----------------------------------------------------------------------===//
21
22 #include "SIPeepholeSDWA.h"
23 #include "AMDGPU.h"
24 #include "GCNSubtarget.h"
25 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
26 #include "llvm/ADT/MapVector.h"
27 #include "llvm/ADT/Statistic.h"
28 #include "llvm/CodeGen/MachineFunctionPass.h"
29 #include <optional>
30
31 using namespace llvm;
32
33 #define DEBUG_TYPE "si-peephole-sdwa"
34
35 STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found.");
36 STATISTIC(NumSDWAInstructionsPeepholed,
37 "Number of instructions converted to SDWA.");
38
39 namespace {
40
41 bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST,
42 const SIInstrInfo *TII);
43 class SDWAOperand;
44 class SDWADstOperand;
45
46 using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>;
47 using SDWAOperandsMap = MapVector<MachineInstr *, SDWAOperandsVector>;
48
49 class SIPeepholeSDWA {
50 private:
51 MachineRegisterInfo *MRI;
52 const SIRegisterInfo *TRI;
53 const SIInstrInfo *TII;
54
55 MapVector<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
56 SDWAOperandsMap PotentialMatches;
57 SmallVector<MachineInstr *, 8> ConvertedInstructions;
58
59 std::optional<int64_t> foldToImm(const MachineOperand &Op) const;
60
61 void matchSDWAOperands(MachineBasicBlock &MBB);
62 std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
63 void pseudoOpConvertToVOP2(MachineInstr &MI,
64 const GCNSubtarget &ST) const;
65 void convertVcndmaskToVOP2(MachineInstr &MI, const GCNSubtarget &ST) const;
66 MachineInstr *createSDWAVersion(MachineInstr &MI);
67 bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
68 void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const;
69
70 public:
71 bool run(MachineFunction &MF);
72 };
73
74 class SIPeepholeSDWALegacy : public MachineFunctionPass {
75 public:
76 static char ID;
77
78 SIPeepholeSDWALegacy() : MachineFunctionPass(ID) {}
79
80 StringRef getPassName() const override { return "SI Peephole SDWA"; }
81
82 bool runOnMachineFunction(MachineFunction &MF) override;
83
84 void getAnalysisUsage(AnalysisUsage &AU) const override {
85 AU.setPreservesCFG();
86 MachineFunctionPass::getAnalysisUsage(AU);
87 }
88 };
89
90 using namespace AMDGPU::SDWA;
91
92 class SDWAOperand {
93 private:
94 MachineOperand *Target; // Operand that would be used in converted instruction
95 MachineOperand *Replaced; // Operand that would be replaced by Target
96
97 /// Returns true iff the SDWA selection of this SDWAOperand can be combined
98 /// with the SDWA selections of its uses in \p MI.
99 virtual bool canCombineSelections(const MachineInstr &MI, 100 const SIInstrInfo *TII) = 0; 101 102 public: 103 SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp) 104 : Target(TargetOp), Replaced(ReplacedOp) { 105 assert(Target->isReg()); 106 assert(Replaced->isReg()); 107 } 108 109 virtual ~SDWAOperand() = default; 110 111 virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII, 112 const GCNSubtarget &ST, 113 SDWAOperandsMap *PotentialMatches = nullptr) = 0; 114 virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0; 115 116 MachineOperand *getTargetOperand() const { return Target; } 117 MachineOperand *getReplacedOperand() const { return Replaced; } 118 MachineInstr *getParentInst() const { return Target->getParent(); } 119 120 MachineRegisterInfo *getMRI() const { 121 return &getParentInst()->getParent()->getParent()->getRegInfo(); 122 } 123 124 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 125 virtual void print(raw_ostream& OS) const = 0; 126 void dump() const { print(dbgs()); } 127 #endif 128 }; 129 130 class SDWASrcOperand : public SDWAOperand { 131 private: 132 SdwaSel SrcSel; 133 bool Abs; 134 bool Neg; 135 bool Sext; 136 137 public: 138 SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp, 139 SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false, 140 bool Sext_ = false) 141 : SDWAOperand(TargetOp, ReplacedOp), SrcSel(SrcSel_), Abs(Abs_), 142 Neg(Neg_), Sext(Sext_) {} 143 144 MachineInstr *potentialToConvert(const SIInstrInfo *TII, 145 const GCNSubtarget &ST, 146 SDWAOperandsMap *PotentialMatches = nullptr) override; 147 bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override; 148 bool canCombineSelections(const MachineInstr &MI, 149 const SIInstrInfo *TII) override; 150 151 SdwaSel getSrcSel() const { return SrcSel; } 152 bool getAbs() const { return Abs; } 153 bool getNeg() const { return Neg; } 154 bool getSext() const { return Sext; } 155 156 uint64_t getSrcMods(const SIInstrInfo *TII, 157 const MachineOperand *SrcOp) const; 158 159 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 160 void print(raw_ostream& OS) const override; 161 #endif 162 }; 163 164 class SDWADstOperand : public SDWAOperand { 165 private: 166 SdwaSel DstSel; 167 DstUnused DstUn; 168 169 public: 170 SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp, 171 SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD) 172 : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {} 173 174 MachineInstr *potentialToConvert(const SIInstrInfo *TII, 175 const GCNSubtarget &ST, 176 SDWAOperandsMap *PotentialMatches = nullptr) override; 177 bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override; 178 bool canCombineSelections(const MachineInstr &MI, 179 const SIInstrInfo *TII) override; 180 181 SdwaSel getDstSel() const { return DstSel; } 182 DstUnused getDstUnused() const { return DstUn; } 183 184 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 185 void print(raw_ostream& OS) const override; 186 #endif 187 }; 188 189 class SDWADstPreserveOperand : public SDWADstOperand { 190 private: 191 MachineOperand *Preserve; 192 193 public: 194 SDWADstPreserveOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp, 195 MachineOperand *PreserveOp, SdwaSel DstSel_ = DWORD) 196 : SDWADstOperand(TargetOp, ReplacedOp, DstSel_, UNUSED_PRESERVE), 197 Preserve(PreserveOp) {} 198 199 bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override; 200 bool canCombineSelections(const MachineInstr &MI, 
201 const SIInstrInfo *TII) override; 202 203 MachineOperand *getPreservedOperand() const { return Preserve; } 204 205 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 206 void print(raw_ostream& OS) const override; 207 #endif 208 }; 209 210 } // end anonymous namespace 211 212 INITIALIZE_PASS(SIPeepholeSDWALegacy, DEBUG_TYPE, "SI Peephole SDWA", false, 213 false) 214 215 char SIPeepholeSDWALegacy::ID = 0; 216 217 char &llvm::SIPeepholeSDWALegacyID = SIPeepholeSDWALegacy::ID; 218 219 FunctionPass *llvm::createSIPeepholeSDWALegacyPass() { 220 return new SIPeepholeSDWALegacy(); 221 } 222 223 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 224 static raw_ostream& operator<<(raw_ostream &OS, SdwaSel Sel) { 225 switch(Sel) { 226 case BYTE_0: OS << "BYTE_0"; break; 227 case BYTE_1: OS << "BYTE_1"; break; 228 case BYTE_2: OS << "BYTE_2"; break; 229 case BYTE_3: OS << "BYTE_3"; break; 230 case WORD_0: OS << "WORD_0"; break; 231 case WORD_1: OS << "WORD_1"; break; 232 case DWORD: OS << "DWORD"; break; 233 } 234 return OS; 235 } 236 237 static raw_ostream& operator<<(raw_ostream &OS, const DstUnused &Un) { 238 switch(Un) { 239 case UNUSED_PAD: OS << "UNUSED_PAD"; break; 240 case UNUSED_SEXT: OS << "UNUSED_SEXT"; break; 241 case UNUSED_PRESERVE: OS << "UNUSED_PRESERVE"; break; 242 } 243 return OS; 244 } 245 246 LLVM_DUMP_METHOD 247 void SDWASrcOperand::print(raw_ostream& OS) const { 248 OS << "SDWA src: " << *getTargetOperand() 249 << " src_sel:" << getSrcSel() 250 << " abs:" << getAbs() << " neg:" << getNeg() 251 << " sext:" << getSext() << '\n'; 252 } 253 254 LLVM_DUMP_METHOD 255 void SDWADstOperand::print(raw_ostream& OS) const { 256 OS << "SDWA dst: " << *getTargetOperand() 257 << " dst_sel:" << getDstSel() 258 << " dst_unused:" << getDstUnused() << '\n'; 259 } 260 261 LLVM_DUMP_METHOD 262 void SDWADstPreserveOperand::print(raw_ostream& OS) const { 263 OS << "SDWA preserve dst: " << *getTargetOperand() 264 << " dst_sel:" << getDstSel() 265 << " preserve:" << *getPreservedOperand() << '\n'; 266 } 267 268 #endif 269 270 static void copyRegOperand(MachineOperand &To, const MachineOperand &From) { 271 assert(To.isReg() && From.isReg()); 272 To.setReg(From.getReg()); 273 To.setSubReg(From.getSubReg()); 274 To.setIsUndef(From.isUndef()); 275 if (To.isUse()) { 276 To.setIsKill(From.isKill()); 277 } else { 278 To.setIsDead(From.isDead()); 279 } 280 } 281 282 static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) { 283 return LHS.isReg() && 284 RHS.isReg() && 285 LHS.getReg() == RHS.getReg() && 286 LHS.getSubReg() == RHS.getSubReg(); 287 } 288 289 static MachineOperand *findSingleRegUse(const MachineOperand *Reg, 290 const MachineRegisterInfo *MRI) { 291 if (!Reg->isReg() || !Reg->isDef()) 292 return nullptr; 293 294 MachineOperand *ResMO = nullptr; 295 for (MachineOperand &UseMO : MRI->use_nodbg_operands(Reg->getReg())) { 296 // If there exist use of subreg of Reg then return nullptr 297 if (!isSameReg(UseMO, *Reg)) 298 return nullptr; 299 300 // Check that there is only one instruction that uses Reg 301 if (!ResMO) { 302 ResMO = &UseMO; 303 } else if (ResMO->getParent() != UseMO.getParent()) { 304 return nullptr; 305 } 306 } 307 308 return ResMO; 309 } 310 311 static MachineOperand *findSingleRegDef(const MachineOperand *Reg, 312 const MachineRegisterInfo *MRI) { 313 if (!Reg->isReg()) 314 return nullptr; 315 316 MachineInstr *DefInstr = MRI->getUniqueVRegDef(Reg->getReg()); 317 if (!DefInstr) 318 return nullptr; 319 320 for (auto &DefMO : DefInstr->defs()) { 321 if 
(DefMO.isReg() && DefMO.getReg() == Reg->getReg()) 322 return &DefMO; 323 } 324 325 // Ignore implicit defs. 326 return nullptr; 327 } 328 329 /// Combine an SDWA instruction's existing SDWA selection \p Sel with 330 /// the SDWA selection \p OperandSel of its operand. If the selections 331 /// are compatible, return the combined selection, otherwise return a 332 /// nullopt. 333 /// For example, if we have Sel = BYTE_0 Sel and OperandSel = WORD_1: 334 /// BYTE_0 Sel (WORD_1 Sel (%X)) -> BYTE_2 Sel (%X) 335 static std::optional<SdwaSel> combineSdwaSel(SdwaSel Sel, SdwaSel OperandSel) { 336 if (Sel == SdwaSel::DWORD) 337 return OperandSel; 338 339 if (Sel == OperandSel || OperandSel == SdwaSel::DWORD) 340 return Sel; 341 342 if (Sel == SdwaSel::WORD_1 || Sel == SdwaSel::BYTE_2 || 343 Sel == SdwaSel::BYTE_3) 344 return {}; 345 346 if (OperandSel == SdwaSel::WORD_0) 347 return Sel; 348 349 if (OperandSel == SdwaSel::WORD_1) { 350 if (Sel == SdwaSel::BYTE_0) 351 return SdwaSel::BYTE_2; 352 if (Sel == SdwaSel::BYTE_1) 353 return SdwaSel::BYTE_3; 354 if (Sel == SdwaSel::WORD_0) 355 return SdwaSel::WORD_1; 356 } 357 358 return {}; 359 } 360 361 uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII, 362 const MachineOperand *SrcOp) const { 363 uint64_t Mods = 0; 364 const auto *MI = SrcOp->getParent(); 365 if (TII->getNamedOperand(*MI, AMDGPU::OpName::src0) == SrcOp) { 366 if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) { 367 Mods = Mod->getImm(); 368 } 369 } else if (TII->getNamedOperand(*MI, AMDGPU::OpName::src1) == SrcOp) { 370 if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers)) { 371 Mods = Mod->getImm(); 372 } 373 } 374 if (Abs || Neg) { 375 assert(!Sext && 376 "Float and integer src modifiers can't be set simultaneously"); 377 Mods |= Abs ? SISrcMods::ABS : 0u; 378 Mods ^= Neg ? SISrcMods::NEG : 0u; 379 } else if (Sext) { 380 Mods |= SISrcMods::SEXT; 381 } 382 383 return Mods; 384 } 385 386 MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII, 387 const GCNSubtarget &ST, 388 SDWAOperandsMap *PotentialMatches) { 389 if (PotentialMatches != nullptr) { 390 // Fill out the map for all uses if all can be converted 391 MachineOperand *Reg = getReplacedOperand(); 392 if (!Reg->isReg() || !Reg->isDef()) 393 return nullptr; 394 395 for (MachineInstr &UseMI : getMRI()->use_nodbg_instructions(Reg->getReg())) 396 // Check that all instructions that use Reg can be converted 397 if (!isConvertibleToSDWA(UseMI, ST, TII) || 398 !canCombineSelections(UseMI, TII)) 399 return nullptr; 400 401 // Now that it's guaranteed all uses are legal, iterate over the uses again 402 // to add them for later conversion. 403 for (MachineOperand &UseMO : getMRI()->use_nodbg_operands(Reg->getReg())) { 404 // Should not get a subregister here 405 assert(isSameReg(UseMO, *Reg)); 406 407 SDWAOperandsMap &potentialMatchesMap = *PotentialMatches; 408 MachineInstr *UseMI = UseMO.getParent(); 409 potentialMatchesMap[UseMI].push_back(this); 410 } 411 return nullptr; 412 } 413 414 // For SDWA src operand potential instruction is one that use register 415 // defined by parent instruction 416 MachineOperand *PotentialMO = findSingleRegUse(getReplacedOperand(), getMRI()); 417 if (!PotentialMO) 418 return nullptr; 419 420 MachineInstr *Parent = PotentialMO->getParent(); 421 422 return canCombineSelections(*Parent, TII) ? 
Parent : nullptr; 423 } 424 425 bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) { 426 switch (MI.getOpcode()) { 427 case AMDGPU::V_CVT_F32_FP8_sdwa: 428 case AMDGPU::V_CVT_F32_BF8_sdwa: 429 case AMDGPU::V_CVT_PK_F32_FP8_sdwa: 430 case AMDGPU::V_CVT_PK_F32_BF8_sdwa: 431 // Does not support input modifiers: noabs, noneg, nosext. 432 return false; 433 case AMDGPU::V_CNDMASK_B32_sdwa: 434 // SISrcMods uses the same bitmask for SEXT and NEG modifiers and 435 // hence the compiler can only support one type of modifier for 436 // each SDWA instruction. For V_CNDMASK_B32_sdwa, this is NEG 437 // since its operands get printed using 438 // AMDGPUInstPrinter::printOperandAndFPInputMods which produces 439 // the output intended for NEG if SEXT is set. 440 // 441 // The ISA does actually support both modifiers on most SDWA 442 // instructions. 443 // 444 // FIXME Accept SEXT here after fixing this issue. 445 if (Sext) 446 return false; 447 break; 448 } 449 450 // Find operand in instruction that matches source operand and replace it with 451 // target operand. Set corresponding src_sel 452 bool IsPreserveSrc = false; 453 MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0); 454 MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel); 455 MachineOperand *SrcMods = 456 TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers); 457 assert(Src && (Src->isReg() || Src->isImm())); 458 if (!isSameReg(*Src, *getReplacedOperand())) { 459 // If this is not src0 then it could be src1 460 Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1); 461 SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel); 462 SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers); 463 464 if (!Src || 465 !isSameReg(*Src, *getReplacedOperand())) { 466 // It's possible this Src is a tied operand for 467 // UNUSED_PRESERVE, in which case we can either 468 // abandon the peephole attempt, or if legal we can 469 // copy the target operand into the tied slot 470 // if the preserve operation will effectively cause the same 471 // result by overwriting the rest of the dst. 472 MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); 473 MachineOperand *DstUnused = 474 TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused); 475 476 if (Dst && 477 DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) { 478 // This will work if the tied src is accessing WORD_0, and the dst is 479 // writing WORD_1. Modifiers don't matter because all the bits that 480 // would be impacted are being overwritten by the dst. 481 // Any other case will not work. 482 SdwaSel DstSel = static_cast<SdwaSel>( 483 TII->getNamedImmOperand(MI, AMDGPU::OpName::dst_sel)); 484 if (DstSel == AMDGPU::SDWA::SdwaSel::WORD_1 && 485 getSrcSel() == AMDGPU::SDWA::SdwaSel::WORD_0) { 486 IsPreserveSrc = true; 487 auto DstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), 488 AMDGPU::OpName::vdst); 489 auto TiedIdx = MI.findTiedOperandIdx(DstIdx); 490 Src = &MI.getOperand(TiedIdx); 491 SrcSel = nullptr; 492 SrcMods = nullptr; 493 } else { 494 // Not legal to convert this src 495 return false; 496 } 497 } 498 } 499 assert(Src && Src->isReg()); 500 501 if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa || 502 MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa || 503 MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa || 504 MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) && 505 !isSameReg(*Src, *getReplacedOperand())) { 506 // In case of v_mac_f16/32_sdwa this pass can try to apply src operand to 507 // src2. 
This is not allowed. 508 return false; 509 } 510 511 assert(isSameReg(*Src, *getReplacedOperand()) && 512 (IsPreserveSrc || (SrcSel && SrcMods))); 513 } 514 copyRegOperand(*Src, *getTargetOperand()); 515 if (!IsPreserveSrc) { 516 SdwaSel ExistingSel = static_cast<SdwaSel>(SrcSel->getImm()); 517 SrcSel->setImm(*combineSdwaSel(ExistingSel, getSrcSel())); 518 SrcMods->setImm(getSrcMods(TII, Src)); 519 } 520 getTargetOperand()->setIsKill(false); 521 return true; 522 } 523 524 /// Verify that the SDWA selection operand \p SrcSelOpName of the SDWA 525 /// instruction \p MI can be combined with the selection \p OpSel. 526 static bool canCombineOpSel(const MachineInstr &MI, const SIInstrInfo *TII, 527 AMDGPU::OpName SrcSelOpName, SdwaSel OpSel) { 528 assert(TII->isSDWA(MI.getOpcode())); 529 530 const MachineOperand *SrcSelOp = TII->getNamedOperand(MI, SrcSelOpName); 531 SdwaSel SrcSel = static_cast<SdwaSel>(SrcSelOp->getImm()); 532 533 return combineSdwaSel(SrcSel, OpSel).has_value(); 534 } 535 536 /// Verify that \p Op is the same register as the operand of the SDWA 537 /// instruction \p MI named by \p SrcOpName and that the SDWA 538 /// selection \p SrcSelOpName can be combined with the \p OpSel. 539 static bool canCombineOpSel(const MachineInstr &MI, const SIInstrInfo *TII, 540 AMDGPU::OpName SrcOpName, 541 AMDGPU::OpName SrcSelOpName, MachineOperand *Op, 542 SdwaSel OpSel) { 543 assert(TII->isSDWA(MI.getOpcode())); 544 545 const MachineOperand *Src = TII->getNamedOperand(MI, SrcOpName); 546 if (!Src || !isSameReg(*Src, *Op)) 547 return true; 548 549 return canCombineOpSel(MI, TII, SrcSelOpName, OpSel); 550 } 551 552 bool SDWASrcOperand::canCombineSelections(const MachineInstr &MI, 553 const SIInstrInfo *TII) { 554 if (!TII->isSDWA(MI.getOpcode())) 555 return true; 556 557 using namespace AMDGPU; 558 559 return canCombineOpSel(MI, TII, OpName::src0, OpName::src0_sel, 560 getReplacedOperand(), getSrcSel()) && 561 canCombineOpSel(MI, TII, OpName::src1, OpName::src1_sel, 562 getReplacedOperand(), getSrcSel()); 563 } 564 565 MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII, 566 const GCNSubtarget &ST, 567 SDWAOperandsMap *PotentialMatches) { 568 // For SDWA dst operand potential instruction is one that defines register 569 // that this operand uses 570 MachineRegisterInfo *MRI = getMRI(); 571 MachineInstr *ParentMI = getParentInst(); 572 573 MachineOperand *PotentialMO = findSingleRegDef(getReplacedOperand(), MRI); 574 if (!PotentialMO) 575 return nullptr; 576 577 // Check that ParentMI is the only instruction that uses replaced register 578 for (MachineInstr &UseInst : MRI->use_nodbg_instructions(PotentialMO->getReg())) { 579 if (&UseInst != ParentMI) 580 return nullptr; 581 } 582 583 MachineInstr *Parent = PotentialMO->getParent(); 584 return canCombineSelections(*Parent, TII) ? Parent : nullptr; 585 } 586 587 bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) { 588 // Replace vdst operand in MI with target operand. 
Set dst_sel and dst_unused 589 590 if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa || 591 MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa || 592 MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa || 593 MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) && 594 getDstSel() != AMDGPU::SDWA::DWORD) { 595 // v_mac_f16/32_sdwa allow dst_sel to be equal only to DWORD 596 return false; 597 } 598 599 MachineOperand *Operand = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); 600 assert(Operand && 601 Operand->isReg() && 602 isSameReg(*Operand, *getReplacedOperand())); 603 copyRegOperand(*Operand, *getTargetOperand()); 604 MachineOperand *DstSel= TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel); 605 assert(DstSel); 606 607 SdwaSel ExistingSel = static_cast<SdwaSel>(DstSel->getImm()); 608 DstSel->setImm(combineSdwaSel(ExistingSel, getDstSel()).value()); 609 610 MachineOperand *DstUnused= TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused); 611 assert(DstUnused); 612 DstUnused->setImm(getDstUnused()); 613 614 // Remove original instruction because it would conflict with our new 615 // instruction by register definition 616 getParentInst()->eraseFromParent(); 617 return true; 618 } 619 620 bool SDWADstOperand::canCombineSelections(const MachineInstr &MI, 621 const SIInstrInfo *TII) { 622 if (!TII->isSDWA(MI.getOpcode())) 623 return true; 624 625 return canCombineOpSel(MI, TII, AMDGPU::OpName::dst_sel, getDstSel()); 626 } 627 628 bool SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI, 629 const SIInstrInfo *TII) { 630 // MI should be moved right before v_or_b32. 631 // For this we should clear all kill flags on uses of MI src-operands or else 632 // we can encounter problem with use of killed operand. 633 for (MachineOperand &MO : MI.uses()) { 634 if (!MO.isReg()) 635 continue; 636 getMRI()->clearKillFlags(MO.getReg()); 637 } 638 639 // Move MI before v_or_b32 640 MI.getParent()->remove(&MI); 641 getParentInst()->getParent()->insert(getParentInst(), &MI); 642 643 // Add Implicit use of preserved register 644 MachineInstrBuilder MIB(*MI.getMF(), MI); 645 MIB.addReg(getPreservedOperand()->getReg(), 646 RegState::ImplicitKill, 647 getPreservedOperand()->getSubReg()); 648 649 // Tie dst to implicit use 650 MI.tieOperands(AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst), 651 MI.getNumOperands() - 1); 652 653 // Convert MI as any other SDWADstOperand and remove v_or_b32 654 return SDWADstOperand::convertToSDWA(MI, TII); 655 } 656 657 bool SDWADstPreserveOperand::canCombineSelections(const MachineInstr &MI, 658 const SIInstrInfo *TII) { 659 return SDWADstOperand::canCombineSelections(MI, TII); 660 } 661 662 std::optional<int64_t> 663 SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const { 664 if (Op.isImm()) { 665 return Op.getImm(); 666 } 667 668 // If this is not immediate then it can be copy of immediate value, e.g.: 669 // %1 = S_MOV_B32 255; 670 if (Op.isReg()) { 671 for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) { 672 if (!isSameReg(Op, Def)) 673 continue; 674 675 const MachineInstr *DefInst = Def.getParent(); 676 if (!TII->isFoldableCopy(*DefInst)) 677 return std::nullopt; 678 679 const MachineOperand &Copied = DefInst->getOperand(1); 680 if (!Copied.isImm()) 681 return std::nullopt; 682 683 return Copied.getImm(); 684 } 685 } 686 687 return std::nullopt; 688 } 689 690 std::unique_ptr<SDWAOperand> 691 SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) { 692 unsigned Opcode = MI.getOpcode(); 693 switch (Opcode) { 694 case AMDGPU::V_LSHRREV_B32_e32: 695 case AMDGPU::V_ASHRREV_I32_e32: 
696 case AMDGPU::V_LSHLREV_B32_e32: 697 case AMDGPU::V_LSHRREV_B32_e64: 698 case AMDGPU::V_ASHRREV_I32_e64: 699 case AMDGPU::V_LSHLREV_B32_e64: { 700 // from: v_lshrrev_b32_e32 v1, 16/24, v0 701 // to SDWA src:v0 src_sel:WORD_1/BYTE_3 702 703 // from: v_ashrrev_i32_e32 v1, 16/24, v0 704 // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1 705 706 // from: v_lshlrev_b32_e32 v1, 16/24, v0 707 // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD 708 MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); 709 auto Imm = foldToImm(*Src0); 710 if (!Imm) 711 break; 712 713 if (*Imm != 16 && *Imm != 24) 714 break; 715 716 MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); 717 MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); 718 if (!Src1->isReg() || Src1->getReg().isPhysical() || 719 Dst->getReg().isPhysical()) 720 break; 721 722 if (Opcode == AMDGPU::V_LSHLREV_B32_e32 || 723 Opcode == AMDGPU::V_LSHLREV_B32_e64) { 724 return std::make_unique<SDWADstOperand>( 725 Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD); 726 } 727 return std::make_unique<SDWASrcOperand>( 728 Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false, 729 Opcode != AMDGPU::V_LSHRREV_B32_e32 && 730 Opcode != AMDGPU::V_LSHRREV_B32_e64); 731 break; 732 } 733 734 case AMDGPU::V_LSHRREV_B16_e32: 735 case AMDGPU::V_ASHRREV_I16_e32: 736 case AMDGPU::V_LSHLREV_B16_e32: 737 case AMDGPU::V_LSHRREV_B16_e64: 738 case AMDGPU::V_LSHRREV_B16_opsel_e64: 739 case AMDGPU::V_ASHRREV_I16_e64: 740 case AMDGPU::V_LSHLREV_B16_opsel_e64: 741 case AMDGPU::V_LSHLREV_B16_e64: { 742 // from: v_lshrrev_b16_e32 v1, 8, v0 743 // to SDWA src:v0 src_sel:BYTE_1 744 745 // from: v_ashrrev_i16_e32 v1, 8, v0 746 // to SDWA src:v0 src_sel:BYTE_1 sext:1 747 748 // from: v_lshlrev_b16_e32 v1, 8, v0 749 // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD 750 MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); 751 auto Imm = foldToImm(*Src0); 752 if (!Imm || *Imm != 8) 753 break; 754 755 MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); 756 MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); 757 758 if (!Src1->isReg() || Src1->getReg().isPhysical() || 759 Dst->getReg().isPhysical()) 760 break; 761 762 if (Opcode == AMDGPU::V_LSHLREV_B16_e32 || 763 Opcode == AMDGPU::V_LSHLREV_B16_opsel_e64 || 764 Opcode == AMDGPU::V_LSHLREV_B16_e64) 765 return std::make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD); 766 return std::make_unique<SDWASrcOperand>( 767 Src1, Dst, BYTE_1, false, false, 768 Opcode != AMDGPU::V_LSHRREV_B16_e32 && 769 Opcode != AMDGPU::V_LSHRREV_B16_opsel_e64 && 770 Opcode != AMDGPU::V_LSHRREV_B16_e64); 771 break; 772 } 773 774 case AMDGPU::V_BFE_I32_e64: 775 case AMDGPU::V_BFE_U32_e64: { 776 // e.g.: 777 // from: v_bfe_u32 v1, v0, 8, 8 778 // to SDWA src:v0 src_sel:BYTE_1 779 780 // offset | width | src_sel 781 // ------------------------ 782 // 0 | 8 | BYTE_0 783 // 0 | 16 | WORD_0 784 // 0 | 32 | DWORD ? 
785 // 8 | 8 | BYTE_1 786 // 16 | 8 | BYTE_2 787 // 16 | 16 | WORD_1 788 // 24 | 8 | BYTE_3 789 790 MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); 791 auto Offset = foldToImm(*Src1); 792 if (!Offset) 793 break; 794 795 MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2); 796 auto Width = foldToImm(*Src2); 797 if (!Width) 798 break; 799 800 SdwaSel SrcSel = DWORD; 801 802 if (*Offset == 0 && *Width == 8) 803 SrcSel = BYTE_0; 804 else if (*Offset == 0 && *Width == 16) 805 SrcSel = WORD_0; 806 else if (*Offset == 0 && *Width == 32) 807 SrcSel = DWORD; 808 else if (*Offset == 8 && *Width == 8) 809 SrcSel = BYTE_1; 810 else if (*Offset == 16 && *Width == 8) 811 SrcSel = BYTE_2; 812 else if (*Offset == 16 && *Width == 16) 813 SrcSel = WORD_1; 814 else if (*Offset == 24 && *Width == 8) 815 SrcSel = BYTE_3; 816 else 817 break; 818 819 MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); 820 MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); 821 822 if (!Src0->isReg() || Src0->getReg().isPhysical() || 823 Dst->getReg().isPhysical()) 824 break; 825 826 return std::make_unique<SDWASrcOperand>( 827 Src0, Dst, SrcSel, false, false, Opcode != AMDGPU::V_BFE_U32_e64); 828 } 829 830 case AMDGPU::V_AND_B32_e32: 831 case AMDGPU::V_AND_B32_e64: { 832 // e.g.: 833 // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0 834 // to SDWA src:v0 src_sel:WORD_0/BYTE_0 835 836 MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); 837 MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); 838 auto *ValSrc = Src1; 839 auto Imm = foldToImm(*Src0); 840 841 if (!Imm) { 842 Imm = foldToImm(*Src1); 843 ValSrc = Src0; 844 } 845 846 if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff)) 847 break; 848 849 MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); 850 851 if (!ValSrc->isReg() || ValSrc->getReg().isPhysical() || 852 Dst->getReg().isPhysical()) 853 break; 854 855 return std::make_unique<SDWASrcOperand>( 856 ValSrc, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0); 857 } 858 859 case AMDGPU::V_OR_B32_e32: 860 case AMDGPU::V_OR_B32_e64: { 861 // Patterns for dst_unused:UNUSED_PRESERVE. 
862 // e.g., from:
863 // v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD
864 // src0_sel:WORD_1 src1_sel:WORD_1
865 // v_add_f16_e32 v3, v1, v2
866 // v_or_b32_e32 v4, v0, v3
867 // to SDWA preserve dst:v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE preserve:v3
868
869 // Check if one of the operands of v_or_b32 is an SDWA instruction
870 using CheckRetType =
871 std::optional<std::pair<MachineOperand *, MachineOperand *>>;
872 auto CheckOROperandsForSDWA =
873 [&](const MachineOperand *Op1, const MachineOperand *Op2) -> CheckRetType {
874 if (!Op1 || !Op1->isReg() || !Op2 || !Op2->isReg())
875 return CheckRetType(std::nullopt);
876
877 MachineOperand *Op1Def = findSingleRegDef(Op1, MRI);
878 if (!Op1Def)
879 return CheckRetType(std::nullopt);
880
881 MachineInstr *Op1Inst = Op1Def->getParent();
882 if (!TII->isSDWA(*Op1Inst))
883 return CheckRetType(std::nullopt);
884
885 MachineOperand *Op2Def = findSingleRegDef(Op2, MRI);
886 if (!Op2Def)
887 return CheckRetType(std::nullopt);
888
889 return CheckRetType(std::pair(Op1Def, Op2Def));
890 };
891
892 MachineOperand *OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
893 MachineOperand *OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
894 assert(OrSDWA && OrOther);
895 auto Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
896 if (!Res) {
897 OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
898 OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
899 assert(OrSDWA && OrOther);
900 Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
901 if (!Res)
902 break;
903 }
904
905 MachineOperand *OrSDWADef = Res->first;
906 MachineOperand *OrOtherDef = Res->second;
907 assert(OrSDWADef && OrOtherDef);
908
909 MachineInstr *SDWAInst = OrSDWADef->getParent();
910 MachineInstr *OtherInst = OrOtherDef->getParent();
911
912 // Check that OtherInst is actually bitwise compatible with SDWAInst, i.e.
913 // their destination write patterns don't overlap. A compatible instruction
914 // can be either a regular instruction with compatible bitness or an SDWA
915 // instruction with the correct dst_sel
916 // SDWAInst | OtherInst bitness / OtherInst dst_sel
917 // -----------------------------------------------------
918 // DWORD | no / no
919 // WORD_0 | no / BYTE_2/3, WORD_1
920 // WORD_1 | 8/16-bit instructions / BYTE_0/1, WORD_0
921 // BYTE_0 | no / BYTE_1/2/3, WORD_1
922 // BYTE_1 | 8-bit / BYTE_0/2/3, WORD_1
923 // BYTE_2 | 8/16-bit / BYTE_0/1/3, WORD_0
924 // BYTE_3 | 8/16/24-bit / BYTE_0/1/2, WORD_0
925 // E.g. if SDWAInst is v_add_f16_sdwa dst_sel:WORD_1 then v_add_f16 is OK
926 // but v_add_f32 is not.
927
928 // TODO: add support for non-SDWA instructions as OtherInst.
929 // For now this only works with SDWA instructions. For regular instructions
930 // there is no way to determine if the instruction writes only 8/16/24 bits
931 // out of the full register size, and all registers are at least 32 bits wide.
932 if (!TII->isSDWA(*OtherInst)) 933 break; 934 935 SdwaSel DstSel = static_cast<SdwaSel>( 936 TII->getNamedImmOperand(*SDWAInst, AMDGPU::OpName::dst_sel)); 937 SdwaSel OtherDstSel = static_cast<SdwaSel>( 938 TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_sel)); 939 940 bool DstSelAgree = false; 941 switch (DstSel) { 942 case WORD_0: DstSelAgree = ((OtherDstSel == BYTE_2) || 943 (OtherDstSel == BYTE_3) || 944 (OtherDstSel == WORD_1)); 945 break; 946 case WORD_1: DstSelAgree = ((OtherDstSel == BYTE_0) || 947 (OtherDstSel == BYTE_1) || 948 (OtherDstSel == WORD_0)); 949 break; 950 case BYTE_0: DstSelAgree = ((OtherDstSel == BYTE_1) || 951 (OtherDstSel == BYTE_2) || 952 (OtherDstSel == BYTE_3) || 953 (OtherDstSel == WORD_1)); 954 break; 955 case BYTE_1: DstSelAgree = ((OtherDstSel == BYTE_0) || 956 (OtherDstSel == BYTE_2) || 957 (OtherDstSel == BYTE_3) || 958 (OtherDstSel == WORD_1)); 959 break; 960 case BYTE_2: DstSelAgree = ((OtherDstSel == BYTE_0) || 961 (OtherDstSel == BYTE_1) || 962 (OtherDstSel == BYTE_3) || 963 (OtherDstSel == WORD_0)); 964 break; 965 case BYTE_3: DstSelAgree = ((OtherDstSel == BYTE_0) || 966 (OtherDstSel == BYTE_1) || 967 (OtherDstSel == BYTE_2) || 968 (OtherDstSel == WORD_0)); 969 break; 970 default: DstSelAgree = false; 971 } 972 973 if (!DstSelAgree) 974 break; 975 976 // Also OtherInst dst_unused should be UNUSED_PAD 977 DstUnused OtherDstUnused = static_cast<DstUnused>( 978 TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_unused)); 979 if (OtherDstUnused != DstUnused::UNUSED_PAD) 980 break; 981 982 // Create DstPreserveOperand 983 MachineOperand *OrDst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); 984 assert(OrDst && OrDst->isReg()); 985 986 return std::make_unique<SDWADstPreserveOperand>( 987 OrDst, OrSDWADef, OrOtherDef, DstSel); 988 989 } 990 } 991 992 return std::unique_ptr<SDWAOperand>(nullptr); 993 } 994 995 #if !defined(NDEBUG) 996 static raw_ostream& operator<<(raw_ostream &OS, const SDWAOperand &Operand) { 997 Operand.print(OS); 998 return OS; 999 } 1000 #endif 1001 1002 void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) { 1003 for (MachineInstr &MI : MBB) { 1004 if (auto Operand = matchSDWAOperand(MI)) { 1005 LLVM_DEBUG(dbgs() << "Match: " << MI << "To: " << *Operand << '\n'); 1006 SDWAOperands[&MI] = std::move(Operand); 1007 ++NumSDWAPatternsFound; 1008 } 1009 } 1010 } 1011 1012 // Convert the V_ADD_CO_U32_e64 into V_ADD_CO_U32_e32. This allows 1013 // isConvertibleToSDWA to perform its transformation on V_ADD_CO_U32_e32 into 1014 // V_ADD_CO_U32_sdwa. 1015 // 1016 // We are transforming from a VOP3 into a VOP2 form of the instruction. 
1017 // %19:vgpr_32 = V_AND_B32_e32 255,
1018 // killed %16:vgpr_32, implicit $exec
1019 // %47:vgpr_32, %49:sreg_64_xexec = V_ADD_CO_U32_e64
1020 // %26.sub0:vreg_64, %19:vgpr_32, implicit $exec
1021 // %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64
1022 // %26.sub1:vreg_64, %54:vgpr_32, killed %49:sreg_64_xexec, implicit $exec
1023 //
1024 // becomes
1025 // %47:vgpr_32 = V_ADD_CO_U32_sdwa
1026 // 0, %26.sub0:vreg_64, 0, killed %16:vgpr_32, 0, 6, 0, 6, 0,
1027 // implicit-def $vcc, implicit $exec
1028 // %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64
1029 // %26.sub1:vreg_64, %54:vgpr_32, killed $vcc, implicit $exec
1030 void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
1031 const GCNSubtarget &ST) const {
1032 int Opc = MI.getOpcode();
1033 assert((Opc == AMDGPU::V_ADD_CO_U32_e64 || Opc == AMDGPU::V_SUB_CO_U32_e64) &&
1034 "Currently only handles V_ADD_CO_U32_e64 or V_SUB_CO_U32_e64");
1035
1036 // Can the candidate MI be shrunk?
1037 if (!TII->canShrink(MI, *MRI))
1038 return;
1039 Opc = AMDGPU::getVOPe32(Opc);
1040 // Find the related ADD instruction.
1041 const MachineOperand *Sdst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
1042 if (!Sdst)
1043 return;
1044 MachineOperand *NextOp = findSingleRegUse(Sdst, MRI);
1045 if (!NextOp)
1046 return;
1047 MachineInstr &MISucc = *NextOp->getParent();
1048
1049 // Make sure the carry in/out are subsequently unused.
1050 MachineOperand *CarryIn = TII->getNamedOperand(MISucc, AMDGPU::OpName::src2);
1051 if (!CarryIn)
1052 return;
1053 MachineOperand *CarryOut = TII->getNamedOperand(MISucc, AMDGPU::OpName::sdst);
1054 if (!CarryOut)
1055 return;
1056 if (!MRI->hasOneUse(CarryIn->getReg()) || !MRI->use_empty(CarryOut->getReg()))
1057 return;
1058 // Make sure VCC or its subregs are dead before MI.
1059 MachineBasicBlock &MBB = *MI.getParent();
1060 MachineBasicBlock::LivenessQueryResult Liveness =
1061 MBB.computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 25);
1062 if (Liveness != MachineBasicBlock::LQR_Dead)
1063 return;
1064 // Check if VCC is referenced in range of (MI,MISucc].
1065 for (auto I = std::next(MI.getIterator()), E = MISucc.getIterator();
1066 I != E; ++I) {
1067 if (I->modifiesRegister(AMDGPU::VCC, TRI))
1068 return;
1069 }
1070
1071 // Replace MI with V_{SUB|ADD}_I32_e32
1072 BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opc))
1073 .add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst))
1074 .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0))
1075 .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1))
1076 .setMIFlags(MI.getFlags());
1077
1078 MI.eraseFromParent();
1079
1080 // Since the carry output of MI is now VCC, update its use in MISucc.
1081
1082 MISucc.substituteRegister(CarryIn->getReg(), TRI->getVCC(), 0, *TRI);
1083 }
1084
1085 /// Try to convert a VOP3 instruction \p MI that takes a src2 carry-in
1086 /// operand into the corresponding VOP2 form, which expects the
1087 /// argument in VCC. To this end, add a copy from the carry-in to
1088 /// VCC. The conversion will only be applied if \p MI can be shrunk
1089 /// to VOP2 and if VCC can be proven to be dead before \p MI.
1090 void SIPeepholeSDWA::convertVcndmaskToVOP2(MachineInstr &MI,
1091 const GCNSubtarget &ST) const {
1092 assert(MI.getOpcode() == AMDGPU::V_CNDMASK_B32_e64);
1093
1094 LLVM_DEBUG(dbgs() << "Attempting VOP2 conversion: " << MI);
1095 if (!TII->canShrink(MI, *MRI)) {
1096 LLVM_DEBUG(dbgs() << "Cannot shrink instruction\n");
1097 return;
1098 }
1099
1100 const MachineOperand &CarryIn =
1101 *TII->getNamedOperand(MI, AMDGPU::OpName::src2);
1102 Register CarryReg = CarryIn.getReg();
1103 MachineInstr *CarryDef = MRI->getVRegDef(CarryReg);
1104 if (!CarryDef) {
1105 LLVM_DEBUG(dbgs() << "Missing carry-in operand definition\n");
1106 return;
1107 }
1108
1109 // Make sure VCC or its subregs are dead before MI.
1110 MCRegister Vcc = TRI->getVCC();
1111 MachineBasicBlock &MBB = *MI.getParent();
1112 MachineBasicBlock::LivenessQueryResult Liveness =
1113 MBB.computeRegisterLiveness(TRI, Vcc, MI);
1114 if (Liveness != MachineBasicBlock::LQR_Dead) {
1115 LLVM_DEBUG(dbgs() << "VCC not known to be dead before instruction\n");
1116 return;
1117 }
1118
1119 BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), Vcc).add(CarryIn);
1120
1121 auto Converted = BuildMI(MBB, MI, MI.getDebugLoc(),
1122 TII->get(AMDGPU::getVOPe32(MI.getOpcode())))
1123 .add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst))
1124 .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0))
1125 .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1))
1126 .setMIFlags(MI.getFlags());
1127 TII->fixImplicitOperands(*Converted);
1128 LLVM_DEBUG(dbgs() << "Converted to VOP2: " << *Converted);
1129 (void)Converted;
1130 MI.eraseFromParent();
1131 }
1132
1133 namespace {
1134 bool isConvertibleToSDWA(MachineInstr &MI,
1135 const GCNSubtarget &ST,
1136 const SIInstrInfo* TII) {
1137 // Check if this is already an SDWA instruction
1138 unsigned Opc = MI.getOpcode();
1139 if (TII->isSDWA(Opc))
1140 return true;
1141
1142 // Can only be handled after earlier conversion to
1143 // AMDGPU::V_CNDMASK_B32_e32 which is not always possible.
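// run() attempts that conversion via convertVcndmaskToVOP2() for matched
// V_CNDMASK_B32_e64 instructions before re-running the SDWA operand matching.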
1144 if (Opc == AMDGPU::V_CNDMASK_B32_e64)
1145 return false;
1146
1147 // Check if this instruction has an opcode that supports SDWA
1148 if (AMDGPU::getSDWAOp(Opc) == -1)
1149 Opc = AMDGPU::getVOPe32(Opc);
1150
1151 if (AMDGPU::getSDWAOp(Opc) == -1)
1152 return false;
1153
1154 if (!ST.hasSDWAOmod() && TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
1155 return false;
1156
1157 if (TII->isVOPC(Opc)) {
1158 if (!ST.hasSDWASdst()) {
1159 const MachineOperand *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
1160 if (SDst && (SDst->getReg() != AMDGPU::VCC &&
1161 SDst->getReg() != AMDGPU::VCC_LO))
1162 return false;
1163 }
1164
1165 if (!ST.hasSDWAOutModsVOPC() &&
1166 (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
1167 TII->hasModifiersSet(MI, AMDGPU::OpName::omod)))
1168 return false;
1169
1170 } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst) ||
1171 !TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
1172 return false;
1173 }
1174
1175 if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_FMAC_F16_e32 ||
1176 Opc == AMDGPU::V_FMAC_F32_e32 ||
1177 Opc == AMDGPU::V_MAC_F16_e32 ||
1178 Opc == AMDGPU::V_MAC_F32_e32))
1179 return false;
1180
1181 // Check if target supports this SDWA opcode
1182 if (TII->pseudoToMCOpcode(Opc) == -1)
1183 return false;
1184
1185 if (MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0)) {
1186 if (!Src0->isReg() && !Src0->isImm())
1187 return false;
1188 }
1189
1190 if (MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1)) {
1191 if (!Src1->isReg() && !Src1->isImm())
1192 return false;
1193 }
1194
1195 return true;
1196 }
1197 } // namespace
1198
1199 MachineInstr *SIPeepholeSDWA::createSDWAVersion(MachineInstr &MI) {
1200 unsigned Opcode = MI.getOpcode();
1201 assert(!TII->isSDWA(Opcode));
1202
1203 int SDWAOpcode = AMDGPU::getSDWAOp(Opcode);
1204 if (SDWAOpcode == -1)
1205 SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(Opcode));
1206 assert(SDWAOpcode != -1);
1207
1208 const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);
1209
1210 // Create SDWA version of instruction MI and initialize its operands
1211 MachineInstrBuilder SDWAInst =
1212 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc)
1213 .setMIFlags(MI.getFlags());
1214
1215 // Copy dst; if it is present in the original, it should also be present in SDWA
1216 MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
1217 if (Dst) {
1218 assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::vdst));
1219 SDWAInst.add(*Dst);
1220 } else if ((Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst))) {
1221 assert(Dst && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::sdst));
1222 SDWAInst.add(*Dst);
1223 } else {
1224 assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::sdst));
1225 SDWAInst.addReg(TRI->getVCC(), RegState::Define);
1226 }
1227
1228 // Copy src0, initialize src0_modifiers. All SDWA instructions have src0 and
1229 // src0_modifiers (except for v_nop_sdwa, but it can't get here)
1230 MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1231 assert(Src0 && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0) &&
1232 AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0_modifiers));
1233 if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers))
1234 SDWAInst.addImm(Mod->getImm());
1235 else
1236 SDWAInst.addImm(0);
1237 SDWAInst.add(*Src0);
1238
1239 // Copy src1 if present, initialize src1_modifiers.
1240 MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); 1241 if (Src1) { 1242 assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1) && 1243 AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1_modifiers)); 1244 if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)) 1245 SDWAInst.addImm(Mod->getImm()); 1246 else 1247 SDWAInst.addImm(0); 1248 SDWAInst.add(*Src1); 1249 } 1250 1251 if (SDWAOpcode == AMDGPU::V_FMAC_F16_sdwa || 1252 SDWAOpcode == AMDGPU::V_FMAC_F32_sdwa || 1253 SDWAOpcode == AMDGPU::V_MAC_F16_sdwa || 1254 SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) { 1255 // v_mac_f16/32 has additional src2 operand tied to vdst 1256 MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2); 1257 assert(Src2); 1258 SDWAInst.add(*Src2); 1259 } 1260 1261 // Copy clamp if present, initialize otherwise 1262 assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::clamp)); 1263 MachineOperand *Clamp = TII->getNamedOperand(MI, AMDGPU::OpName::clamp); 1264 if (Clamp) { 1265 SDWAInst.add(*Clamp); 1266 } else { 1267 SDWAInst.addImm(0); 1268 } 1269 1270 // Copy omod if present, initialize otherwise if needed 1271 if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::omod)) { 1272 MachineOperand *OMod = TII->getNamedOperand(MI, AMDGPU::OpName::omod); 1273 if (OMod) { 1274 SDWAInst.add(*OMod); 1275 } else { 1276 SDWAInst.addImm(0); 1277 } 1278 } 1279 1280 // Initialize SDWA specific operands 1281 if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_sel)) 1282 SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD); 1283 1284 if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_unused)) 1285 SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD); 1286 1287 assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0_sel)); 1288 SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD); 1289 1290 if (Src1) { 1291 assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1_sel)); 1292 SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD); 1293 } 1294 1295 // Check for a preserved register that needs to be copied. 1296 MachineInstr *Ret = SDWAInst.getInstr(); 1297 TII->fixImplicitOperands(*Ret); 1298 return Ret; 1299 } 1300 1301 bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI, 1302 const SDWAOperandsVector &SDWAOperands) { 1303 LLVM_DEBUG(dbgs() << "Convert instruction:" << MI); 1304 1305 MachineInstr *SDWAInst; 1306 if (TII->isSDWA(MI.getOpcode())) { 1307 // Clone the instruction to allow revoking changes 1308 // made to MI during the processing of the operands 1309 // if the conversion fails. 1310 SDWAInst = MI.getParent()->getParent()->CloneMachineInstr(&MI); 1311 MI.getParent()->insert(MI.getIterator(), SDWAInst); 1312 } else { 1313 SDWAInst = createSDWAVersion(MI); 1314 } 1315 1316 // Apply all sdwa operand patterns. 1317 bool Converted = false; 1318 for (auto &Operand : SDWAOperands) { 1319 LLVM_DEBUG(dbgs() << *SDWAInst << "\nOperand: " << *Operand); 1320 // There should be no intersection between SDWA operands and potential MIs 1321 // e.g.: 1322 // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0 1323 // v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0 1324 // v_add_u32 v3, v4, v2 1325 // 1326 // In that example it is possible that we would fold 2nd instruction into 1327 // 3rd (v_add_u32_sdwa) and then try to fold 1st instruction into 2nd (that 1328 // was already destroyed). So if SDWAOperand is also a potential MI then do 1329 // not apply it. 
1330 if (PotentialMatches.count(Operand->getParentInst()) == 0) 1331 Converted |= Operand->convertToSDWA(*SDWAInst, TII); 1332 } 1333 1334 if (!Converted) { 1335 SDWAInst->eraseFromParent(); 1336 return false; 1337 } 1338 1339 ConvertedInstructions.push_back(SDWAInst); 1340 for (MachineOperand &MO : SDWAInst->uses()) { 1341 if (!MO.isReg()) 1342 continue; 1343 1344 MRI->clearKillFlags(MO.getReg()); 1345 } 1346 LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n'); 1347 ++NumSDWAInstructionsPeepholed; 1348 1349 MI.eraseFromParent(); 1350 return true; 1351 } 1352 1353 // If an instruction was converted to SDWA it should not have immediates or SGPR 1354 // operands (allowed one SGPR on GFX9). Copy its scalar operands into VGPRs. 1355 void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI, 1356 const GCNSubtarget &ST) const { 1357 const MCInstrDesc &Desc = TII->get(MI.getOpcode()); 1358 unsigned ConstantBusCount = 0; 1359 for (MachineOperand &Op : MI.explicit_uses()) { 1360 if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg()))) 1361 continue; 1362 1363 unsigned I = Op.getOperandNo(); 1364 if (Desc.operands()[I].RegClass == -1 || 1365 !TRI->isVSSuperClass(TRI->getRegClass(Desc.operands()[I].RegClass))) 1366 continue; 1367 1368 if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() && 1369 TRI->isSGPRReg(*MRI, Op.getReg())) { 1370 ++ConstantBusCount; 1371 continue; 1372 } 1373 1374 Register VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1375 auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(), 1376 TII->get(AMDGPU::V_MOV_B32_e32), VGPR); 1377 if (Op.isImm()) 1378 Copy.addImm(Op.getImm()); 1379 else if (Op.isReg()) 1380 Copy.addReg(Op.getReg(), Op.isKill() ? RegState::Kill : 0, 1381 Op.getSubReg()); 1382 Op.ChangeToRegister(VGPR, false); 1383 } 1384 } 1385 1386 bool SIPeepholeSDWALegacy::runOnMachineFunction(MachineFunction &MF) { 1387 if (skipFunction(MF.getFunction())) 1388 return false; 1389 1390 return SIPeepholeSDWA().run(MF); 1391 } 1392 1393 bool SIPeepholeSDWA::run(MachineFunction &MF) { 1394 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1395 1396 if (!ST.hasSDWA()) 1397 return false; 1398 1399 MRI = &MF.getRegInfo(); 1400 TRI = ST.getRegisterInfo(); 1401 TII = ST.getInstrInfo(); 1402 1403 // Find all SDWA operands in MF. 1404 bool Ret = false; 1405 for (MachineBasicBlock &MBB : MF) { 1406 bool Changed = false; 1407 do { 1408 // Preprocess the ADD/SUB pairs so they could be SDWA'ed. 1409 // Look for a possible ADD or SUB that resulted from a previously lowered 1410 // V_{ADD|SUB}_U64_PSEUDO. The function pseudoOpConvertToVOP2 1411 // lowers the pair of instructions into e32 form. 1412 matchSDWAOperands(MBB); 1413 for (const auto &OperandPair : SDWAOperands) { 1414 const auto &Operand = OperandPair.second; 1415 MachineInstr *PotentialMI = Operand->potentialToConvert(TII, ST); 1416 if (!PotentialMI) 1417 continue; 1418 1419 switch (PotentialMI->getOpcode()) { 1420 case AMDGPU::V_ADD_CO_U32_e64: 1421 case AMDGPU::V_SUB_CO_U32_e64: 1422 pseudoOpConvertToVOP2(*PotentialMI, ST); 1423 break; 1424 case AMDGPU::V_CNDMASK_B32_e64: 1425 convertVcndmaskToVOP2(*PotentialMI, ST); 1426 break; 1427 }; 1428 } 1429 SDWAOperands.clear(); 1430 1431 // Generate potential match list. 
1432 matchSDWAOperands(MBB); 1433 1434 for (const auto &OperandPair : SDWAOperands) { 1435 const auto &Operand = OperandPair.second; 1436 MachineInstr *PotentialMI = 1437 Operand->potentialToConvert(TII, ST, &PotentialMatches); 1438 1439 if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST, TII)) 1440 PotentialMatches[PotentialMI].push_back(Operand.get()); 1441 } 1442 1443 for (auto &PotentialPair : PotentialMatches) { 1444 MachineInstr &PotentialMI = *PotentialPair.first; 1445 convertToSDWA(PotentialMI, PotentialPair.second); 1446 } 1447 1448 PotentialMatches.clear(); 1449 SDWAOperands.clear(); 1450 1451 Changed = !ConvertedInstructions.empty(); 1452 1453 if (Changed) 1454 Ret = true; 1455 while (!ConvertedInstructions.empty()) 1456 legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST); 1457 } while (Changed); 1458 } 1459 1460 return Ret; 1461 } 1462 1463 PreservedAnalyses SIPeepholeSDWAPass::run(MachineFunction &MF, 1464 MachineFunctionAnalysisManager &) { 1465 if (MF.getFunction().hasOptNone() || !SIPeepholeSDWA().run(MF)) 1466 return PreservedAnalyses::all(); 1467 1468 PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses(); 1469 PA.preserveSet<CFGAnalyses>(); 1470 return PA; 1471 } 1472