//===- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass tries to apply several peephole SDWA patterns.
///
/// E.g. original:
///   V_LSHRREV_B32_e32 %0, 16, %1
///   V_ADD_CO_U32_e32 %2, %0, %3
///   V_LSHLREV_B32_e32 %4, 16, %2
///
/// Replace:
///   V_ADD_CO_U32_sdwa %4, %1, %3
///       dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "si-peephole-sdwa"

STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found.");
STATISTIC(NumSDWAInstructionsPeepholed,
          "Number of instructions converted to SDWA.");

namespace {

bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST,
                         const SIInstrInfo *TII);
class SDWAOperand;
class SDWADstOperand;

using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>;
using SDWAOperandsMap = MapVector<MachineInstr *, SDWAOperandsVector>;

class SIPeepholeSDWA : public MachineFunctionPass {
private:
  MachineRegisterInfo *MRI;
  const SIRegisterInfo *TRI;
  const SIInstrInfo *TII;

  MapVector<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
  SDWAOperandsMap PotentialMatches;
  SmallVector<MachineInstr *, 8> ConvertedInstructions;

  std::optional<int64_t> foldToImm(const MachineOperand &Op) const;

public:
  static char ID;

  SIPeepholeSDWA() : MachineFunctionPass(ID) {
    initializeSIPeepholeSDWAPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
  void matchSDWAOperands(MachineBasicBlock &MBB);
  std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
  void pseudoOpConvertToVOP2(MachineInstr &MI,
                             const GCNSubtarget &ST) const;
  bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
  void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const;

  StringRef getPassName() const override { return "SI Peephole SDWA"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

class SDWAOperand {
private:
  MachineOperand *Target;   // Operand that would be used in converted instruction
  MachineOperand *Replaced; // Operand that would be replaced by Target

public:
  SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp)
      : Target(TargetOp), Replaced(ReplacedOp) {
    assert(Target->isReg());
    assert(Replaced->isReg());
  }

  virtual ~SDWAOperand() = default;

  virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII,
                                           const GCNSubtarget &ST,
                                           SDWAOperandsMap *PotentialMatches = nullptr) = 0;
  virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0;

  MachineOperand *getTargetOperand() const { return Target; }
  MachineOperand *getReplacedOperand() const { return Replaced; }
  MachineInstr *getParentInst() const { return Target->getParent(); }

  MachineRegisterInfo *getMRI() const {
    return &getParentInst()->getParent()->getParent()->getRegInfo();
  }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  virtual void print(raw_ostream& OS) const = 0;
  void dump() const { print(dbgs()); }
#endif
};

using namespace AMDGPU::SDWA;
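
// The concrete subclasses below record the matched patterns: SDWASrcOperand
// folds an extract/extend pattern into a source (src_sel), SDWADstOperand
// folds a shift of the result into the destination (dst_sel), and
// SDWADstPreserveOperand additionally keeps the untouched part of the
// destination via dst_unused:UNUSED_PRESERVE. In each case Target is the
// register the converted instruction will access directly and Replaced is the
// intermediate register that the matched pattern goes through.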
class SDWASrcOperand : public SDWAOperand {
private:
  SdwaSel SrcSel;
  bool Abs;
  bool Neg;
  bool Sext;

public:
  SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false,
                 bool Sext_ = false)
      : SDWAOperand(TargetOp, ReplacedOp),
        SrcSel(SrcSel_), Abs(Abs_), Neg(Neg_), Sext(Sext_) {}

  MachineInstr *potentialToConvert(const SIInstrInfo *TII,
                                   const GCNSubtarget &ST,
                                   SDWAOperandsMap *PotentialMatches = nullptr) override;
  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

  SdwaSel getSrcSel() const { return SrcSel; }
  bool getAbs() const { return Abs; }
  bool getNeg() const { return Neg; }
  bool getSext() const { return Sext; }

  uint64_t getSrcMods(const SIInstrInfo *TII,
                      const MachineOperand *SrcOp) const;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};

class SDWADstOperand : public SDWAOperand {
private:
  SdwaSel DstSel;
  DstUnused DstUn;

public:
  SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD)
      : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}

  MachineInstr *potentialToConvert(const SIInstrInfo *TII,
                                   const GCNSubtarget &ST,
                                   SDWAOperandsMap *PotentialMatches = nullptr) override;
  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

  SdwaSel getDstSel() const { return DstSel; }
  DstUnused getDstUnused() const { return DstUn; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};

class SDWADstPreserveOperand : public SDWADstOperand {
private:
  MachineOperand *Preserve;

public:
  SDWADstPreserveOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                         MachineOperand *PreserveOp, SdwaSel DstSel_ = DWORD)
      : SDWADstOperand(TargetOp, ReplacedOp, DstSel_, UNUSED_PRESERVE),
        Preserve(PreserveOp) {}

  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

  MachineOperand *getPreservedOperand() const { return Preserve; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};

} // end anonymous namespace

INITIALIZE_PASS(SIPeepholeSDWA, DEBUG_TYPE, "SI Peephole SDWA", false, false)

char SIPeepholeSDWA::ID = 0;

char &llvm::SIPeepholeSDWAID = SIPeepholeSDWA::ID;

FunctionPass *llvm::createSIPeepholeSDWAPass() {
  return new SIPeepholeSDWA();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
static raw_ostream& operator<<(raw_ostream &OS, SdwaSel Sel) {
  switch(Sel) {
  case BYTE_0: OS << "BYTE_0"; break;
  case BYTE_1: OS << "BYTE_1"; break;
  case BYTE_2: OS << "BYTE_2"; break;
  case BYTE_3: OS << "BYTE_3"; break;
  case WORD_0: OS << "WORD_0"; break;
  case WORD_1: OS << "WORD_1"; break;
  case DWORD: OS << "DWORD"; break;
  }
  return OS;
}

static raw_ostream& operator<<(raw_ostream &OS, const DstUnused &Un) {
  switch(Un) {
  case UNUSED_PAD: OS << "UNUSED_PAD"; break;
  case UNUSED_SEXT: OS << "UNUSED_SEXT"; break;
  case UNUSED_PRESERVE: OS << "UNUSED_PRESERVE"; break;
  }
  return OS;
}

LLVM_DUMP_METHOD
void SDWASrcOperand::print(raw_ostream& OS) const {
  OS << "SDWA src: " << *getTargetOperand()
     << " src_sel:" << getSrcSel()
     << " abs:" << getAbs() << " neg:" << getNeg()
     << " sext:" << getSext() << '\n';
}

LLVM_DUMP_METHOD
void SDWADstOperand::print(raw_ostream& OS) const {
  OS << "SDWA dst: " << *getTargetOperand()
     << " dst_sel:" << getDstSel()
     << " dst_unused:" << getDstUnused() << '\n';
}

LLVM_DUMP_METHOD
void SDWADstPreserveOperand::print(raw_ostream& OS) const {
  OS << "SDWA preserve dst: " << *getTargetOperand()
     << " dst_sel:" << getDstSel()
     << " preserve:" << *getPreservedOperand() << '\n';
}

#endif

static void copyRegOperand(MachineOperand &To, const MachineOperand &From) {
  assert(To.isReg() && From.isReg());
  To.setReg(From.getReg());
  To.setSubReg(From.getSubReg());
  To.setIsUndef(From.isUndef());
  if (To.isUse()) {
    To.setIsKill(From.isKill());
  } else {
    To.setIsDead(From.isDead());
  }
}

static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) {
  return LHS.isReg() &&
         RHS.isReg() &&
         LHS.getReg() == RHS.getReg() &&
         LHS.getSubReg() == RHS.getSubReg();
}

static MachineOperand *findSingleRegUse(const MachineOperand *Reg,
                                        const MachineRegisterInfo *MRI) {
  if (!Reg->isReg() || !Reg->isDef())
    return nullptr;

  MachineOperand *ResMO = nullptr;
  for (MachineOperand &UseMO : MRI->use_nodbg_operands(Reg->getReg())) {
    // If there exists a use of a subregister of Reg then return nullptr.
    if (!isSameReg(UseMO, *Reg))
      return nullptr;

    // Check that there is only one instruction that uses Reg.
    if (!ResMO) {
      ResMO = &UseMO;
    } else if (ResMO->getParent() != UseMO.getParent()) {
      return nullptr;
    }
  }

  return ResMO;
}

static MachineOperand *findSingleRegDef(const MachineOperand *Reg,
                                        const MachineRegisterInfo *MRI) {
  if (!Reg->isReg())
    return nullptr;

  MachineInstr *DefInstr = MRI->getUniqueVRegDef(Reg->getReg());
  if (!DefInstr)
    return nullptr;

  for (auto &DefMO : DefInstr->defs()) {
    if (DefMO.isReg() && DefMO.getReg() == Reg->getReg())
      return &DefMO;
  }

  // Ignore implicit defs.
  return nullptr;
}
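
// Compute the src_modifiers immediate for the converted instruction: start
// from any modifiers already present on SrcOp in its parent instruction, then
// merge in this operand's own Abs/Neg/Sext. NEG is combined with XOR so that
// folding a negation into a source that is already negated cancels out.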
uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII,
                                    const MachineOperand *SrcOp) const {
  uint64_t Mods = 0;
  const auto *MI = SrcOp->getParent();
  if (TII->getNamedOperand(*MI, AMDGPU::OpName::src0) == SrcOp) {
    if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) {
      Mods = Mod->getImm();
    }
  } else if (TII->getNamedOperand(*MI, AMDGPU::OpName::src1) == SrcOp) {
    if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers)) {
      Mods = Mod->getImm();
    }
  }
  if (Abs || Neg) {
    assert(!Sext &&
           "Float and integer src modifiers can't be set simultaneously");
    Mods |= Abs ? SISrcMods::ABS : 0u;
    Mods ^= Neg ? SISrcMods::NEG : 0u;
  } else if (Sext) {
    Mods |= SISrcMods::SEXT;
  }

  return Mods;
}

MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII,
                                                 const GCNSubtarget &ST,
                                                 SDWAOperandsMap *PotentialMatches) {
  if (PotentialMatches != nullptr) {
    // Fill out the map for all uses if all can be converted.
    MachineOperand *Reg = getReplacedOperand();
    if (!Reg->isReg() || !Reg->isDef())
      return nullptr;

    for (MachineInstr &UseMI : getMRI()->use_nodbg_instructions(Reg->getReg()))
      // Check that all instructions that use Reg can be converted.
      if (!isConvertibleToSDWA(UseMI, ST, TII))
        return nullptr;

    // Now that it's guaranteed all uses are legal, iterate over the uses again
    // to add them for later conversion.
    for (MachineOperand &UseMO : getMRI()->use_nodbg_operands(Reg->getReg())) {
      // Should not get a subregister here.
      assert(isSameReg(UseMO, *Reg));

      SDWAOperandsMap &potentialMatchesMap = *PotentialMatches;
      MachineInstr *UseMI = UseMO.getParent();
      potentialMatchesMap[UseMI].push_back(this);
    }
    return nullptr;
  }

  // For an SDWA src operand, the potential instruction is the one that uses
  // the register defined by the parent instruction.
  MachineOperand *PotentialMO = findSingleRegUse(getReplacedOperand(), getMRI());
  if (!PotentialMO)
    return nullptr;

  return PotentialMO->getParent();
}
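
// Rewrite the matched use instruction MI in place: locate the source operand
// of MI that reads the Replaced register, substitute the Target register for
// it, and fill in the corresponding src_sel and src_modifiers operands.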
bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  switch (MI.getOpcode()) {
  case AMDGPU::V_CVT_F32_FP8_sdwa:
  case AMDGPU::V_CVT_F32_BF8_sdwa:
  case AMDGPU::V_CVT_PK_F32_FP8_sdwa:
  case AMDGPU::V_CVT_PK_F32_BF8_sdwa:
    // Does not support input modifiers: noabs, noneg, nosext.
    return false;
  }

  // Find the operand in the instruction that matches the source operand and
  // replace it with the target operand. Set the corresponding src_sel.
  bool IsPreserveSrc = false;
  MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
  MachineOperand *SrcMods =
      TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
  assert(Src && (Src->isReg() || Src->isImm()));
  if (!isSameReg(*Src, *getReplacedOperand())) {
    // If this is not src0 then it could be src1.
    Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
    SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);

    if (!Src ||
        !isSameReg(*Src, *getReplacedOperand())) {
      // It's possible this Src is a tied operand for
      // UNUSED_PRESERVE, in which case we can either
      // abandon the peephole attempt, or if legal we can
      // copy the target operand into the tied slot
      // if the preserve operation will effectively cause the same
      // result by overwriting the rest of the dst.
      MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
      MachineOperand *DstUnused =
          TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);

      if (Dst &&
          DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
        // This will work if the tied src is accessing WORD_0, and the dst is
        // writing WORD_1. Modifiers don't matter because all the bits that
        // would be impacted are being overwritten by the dst.
        // Any other case will not work.
        SdwaSel DstSel = static_cast<SdwaSel>(
            TII->getNamedImmOperand(MI, AMDGPU::OpName::dst_sel));
        if (DstSel == AMDGPU::SDWA::SdwaSel::WORD_1 &&
            getSrcSel() == AMDGPU::SDWA::SdwaSel::WORD_0) {
          IsPreserveSrc = true;
          auto DstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                                   AMDGPU::OpName::vdst);
          auto TiedIdx = MI.findTiedOperandIdx(DstIdx);
          Src = &MI.getOperand(TiedIdx);
          SrcSel = nullptr;
          SrcMods = nullptr;
        } else {
          // Not legal to convert this src.
          return false;
        }
      }
    }
    assert(Src && Src->isReg());

    if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
         MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
         MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
         MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
        !isSameReg(*Src, *getReplacedOperand())) {
      // In case of v_mac_f16/32_sdwa this pass can try to apply the src
      // operand to src2. This is not allowed.
      return false;
    }

    assert(isSameReg(*Src, *getReplacedOperand()) &&
           (IsPreserveSrc || (SrcSel && SrcMods)));
  }
  copyRegOperand(*Src, *getTargetOperand());
  if (!IsPreserveSrc) {
    SrcSel->setImm(getSrcSel());
    SrcMods->setImm(getSrcMods(TII, Src));
  }
  getTargetOperand()->setIsKill(false);
  return true;
}

MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII,
                                                 const GCNSubtarget &ST,
                                                 SDWAOperandsMap *PotentialMatches) {
  // For an SDWA dst operand, the potential instruction is the one that defines
  // the register that this operand uses.
  MachineRegisterInfo *MRI = getMRI();
  MachineInstr *ParentMI = getParentInst();

  MachineOperand *PotentialMO = findSingleRegDef(getReplacedOperand(), MRI);
  if (!PotentialMO)
    return nullptr;

  // Check that ParentMI is the only instruction that uses the replaced
  // register.
  for (MachineInstr &UseInst : MRI->use_nodbg_instructions(PotentialMO->getReg())) {
    if (&UseInst != ParentMI)
      return nullptr;
  }

  return PotentialMO->getParent();
}

bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  // Replace the vdst operand in MI with the target operand. Set dst_sel and
  // dst_unused.

  if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
      getDstSel() != AMDGPU::SDWA::DWORD) {
    // v_mac_f16/32_sdwa allows dst_sel to be equal only to DWORD.
    return false;
  }

  MachineOperand *Operand = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  assert(Operand &&
         Operand->isReg() &&
         isSameReg(*Operand, *getReplacedOperand()));
  copyRegOperand(*Operand, *getTargetOperand());
  MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
  assert(DstSel);
  DstSel->setImm(getDstSel());
  MachineOperand *DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
  assert(DstUnused);
  DstUnused->setImm(getDstUnused());

  // Remove the original instruction because it would conflict with our new
  // instruction by register definition.
  getParentInst()->eraseFromParent();
  return true;
}
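
// For dst-preserve, MI takes over the v_or_b32's destination while the other
// half of the register flows through via dst_unused:UNUSED_PRESERVE: MI is
// moved to the v_or_b32's position, given an implicit use of the preserved
// register tied to vdst so liveness stays correct, and then converted like a
// plain SDWADstOperand.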
bool SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI,
                                           const SIInstrInfo *TII) {
  // MI should be moved right before v_or_b32.
  // For this we should clear all kill flags on uses of MI src-operands or else
  // we can encounter a problem with the use of a killed operand.
  for (MachineOperand &MO : MI.uses()) {
    if (!MO.isReg())
      continue;
    getMRI()->clearKillFlags(MO.getReg());
  }

  // Move MI before v_or_b32.
  MI.getParent()->remove(&MI);
  getParentInst()->getParent()->insert(getParentInst(), &MI);

  // Add an implicit use of the preserved register.
  MachineInstrBuilder MIB(*MI.getMF(), MI);
  MIB.addReg(getPreservedOperand()->getReg(),
             RegState::ImplicitKill,
             getPreservedOperand()->getSubReg());

  // Tie dst to the implicit use.
  MI.tieOperands(AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst),
                 MI.getNumOperands() - 1);

  // Convert MI as any other SDWADstOperand and remove v_or_b32.
  return SDWADstOperand::convertToSDWA(MI, TII);
}

std::optional<int64_t>
SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const {
  if (Op.isImm()) {
    return Op.getImm();
  }

  // If this is not an immediate then it can be a copy of an immediate value,
  // e.g.:
  // %1 = S_MOV_B32 255;
  if (Op.isReg()) {
    for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) {
      if (!isSameReg(Op, Def))
        continue;

      const MachineInstr *DefInst = Def.getParent();
      if (!TII->isFoldableCopy(*DefInst))
        return std::nullopt;

      const MachineOperand &Copied = DefInst->getOperand(1);
      if (!Copied.isImm())
        return std::nullopt;

      return Copied.getImm();
    }
  }

  return std::nullopt;
}
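
// Try to recognize one of the SDWA-able patterns rooted at MI (shifts by
// 16/24, 16-bit shifts by 8, bitfield extracts, masks by 0xff/0xffff, and the
// v_or_b32 preserve pattern) and build the corresponding SDWAOperand. Returns
// null if MI matches none of them.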
std::unique_ptr<SDWAOperand>
SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  switch (Opcode) {
  case AMDGPU::V_LSHRREV_B32_e32:
  case AMDGPU::V_ASHRREV_I32_e32:
  case AMDGPU::V_LSHLREV_B32_e32:
  case AMDGPU::V_LSHRREV_B32_e64:
  case AMDGPU::V_ASHRREV_I32_e64:
  case AMDGPU::V_LSHLREV_B32_e64: {
    // from: v_lshrrev_b32_e32 v1, 16/24, v0
    // to SDWA src:v0 src_sel:WORD_1/BYTE_3

    // from: v_ashrrev_i32_e32 v1, 16/24, v0
    // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1

    // from: v_lshlrev_b32_e32 v1, 16/24, v0
    // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD
    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    auto Imm = foldToImm(*Src0);
    if (!Imm)
      break;

    if (*Imm != 16 && *Imm != 24)
      break;

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
    if (!Src1->isReg() || Src1->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    if (Opcode == AMDGPU::V_LSHLREV_B32_e32 ||
        Opcode == AMDGPU::V_LSHLREV_B32_e64) {
      return std::make_unique<SDWADstOperand>(
          Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
    }
    return std::make_unique<SDWASrcOperand>(
        Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false,
        Opcode != AMDGPU::V_LSHRREV_B32_e32 &&
        Opcode != AMDGPU::V_LSHRREV_B32_e64);
    break;
  }

  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_LSHLREV_B16_e64: {
    // from: v_lshrrev_b16_e32 v1, 8, v0
    // to SDWA src:v0 src_sel:BYTE_1

    // from: v_ashrrev_i16_e32 v1, 8, v0
    // to SDWA src:v0 src_sel:BYTE_1 sext:1

    // from: v_lshlrev_b16_e32 v1, 8, v0
    // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD
    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    auto Imm = foldToImm(*Src0);
    if (!Imm || *Imm != 8)
      break;

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (!Src1->isReg() || Src1->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    if (Opcode == AMDGPU::V_LSHLREV_B16_e32 ||
        Opcode == AMDGPU::V_LSHLREV_B16_e64)
      return std::make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
    return std::make_unique<SDWASrcOperand>(
        Src1, Dst, BYTE_1, false, false,
        Opcode != AMDGPU::V_LSHRREV_B16_e32 &&
        Opcode != AMDGPU::V_LSHRREV_B16_e64);
    break;
  }

  case AMDGPU::V_BFE_I32_e64:
  case AMDGPU::V_BFE_U32_e64: {
    // e.g.:
    // from: v_bfe_u32 v1, v0, 8, 8
    // to SDWA src:v0 src_sel:BYTE_1

    // offset | width | src_sel
    // ------------------------
    // 0      | 8     | BYTE_0
    // 0      | 16    | WORD_0
    // 0      | 32    | DWORD ?
    // 8      | 8     | BYTE_1
    // 16     | 8     | BYTE_2
    // 16     | 16    | WORD_1
    // 24     | 8     | BYTE_3

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    auto Offset = foldToImm(*Src1);
    if (!Offset)
      break;

    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    auto Width = foldToImm(*Src2);
    if (!Width)
      break;

    SdwaSel SrcSel = DWORD;

    if (*Offset == 0 && *Width == 8)
      SrcSel = BYTE_0;
    else if (*Offset == 0 && *Width == 16)
      SrcSel = WORD_0;
    else if (*Offset == 0 && *Width == 32)
      SrcSel = DWORD;
    else if (*Offset == 8 && *Width == 8)
      SrcSel = BYTE_1;
    else if (*Offset == 16 && *Width == 8)
      SrcSel = BYTE_2;
    else if (*Offset == 16 && *Width == 16)
      SrcSel = WORD_1;
    else if (*Offset == 24 && *Width == 8)
      SrcSel = BYTE_3;
    else
      break;

    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (!Src0->isReg() || Src0->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    return std::make_unique<SDWASrcOperand>(
        Src0, Dst, SrcSel, false, false, Opcode != AMDGPU::V_BFE_U32_e64);
  }

  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::V_AND_B32_e64: {
    // e.g.:
    // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0
    // to SDWA src:v0 src_sel:WORD_0/BYTE_0

    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    auto ValSrc = Src1;
    auto Imm = foldToImm(*Src0);

    if (!Imm) {
      Imm = foldToImm(*Src1);
      ValSrc = Src0;
    }

    if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff))
      break;

    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (!ValSrc->isReg() || ValSrc->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    return std::make_unique<SDWASrcOperand>(
        ValSrc, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
  }

  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::V_OR_B32_e64: {
    // Patterns for dst_unused:UNUSED_PRESERVE.
    // e.g., from:
    // v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD
    //                           src1_sel:WORD_1 src2_sel:WORD_1
    // v_add_f16_e32 v3, v1, v2
    // v_or_b32_e32 v4, v0, v3
    // to SDWA preserve dst:v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE preserve:v3

    // Check if one of the operands of v_or_b32 is an SDWA instruction.
    using CheckRetType =
        std::optional<std::pair<MachineOperand *, MachineOperand *>>;
    auto CheckOROperandsForSDWA =
        [&](const MachineOperand *Op1, const MachineOperand *Op2) -> CheckRetType {
      if (!Op1 || !Op1->isReg() || !Op2 || !Op2->isReg())
        return CheckRetType(std::nullopt);

      MachineOperand *Op1Def = findSingleRegDef(Op1, MRI);
      if (!Op1Def)
        return CheckRetType(std::nullopt);

      MachineInstr *Op1Inst = Op1Def->getParent();
      if (!TII->isSDWA(*Op1Inst))
        return CheckRetType(std::nullopt);

      MachineOperand *Op2Def = findSingleRegDef(Op2, MRI);
      if (!Op2Def)
        return CheckRetType(std::nullopt);

      return CheckRetType(std::pair(Op1Def, Op2Def));
    };

    MachineOperand *OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    assert(OrSDWA && OrOther);
    auto Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
    if (!Res) {
      OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
      OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
      assert(OrSDWA && OrOther);
      Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
      if (!Res)
        break;
    }

    MachineOperand *OrSDWADef = Res->first;
    MachineOperand *OrOtherDef = Res->second;
    assert(OrSDWADef && OrOtherDef);

    MachineInstr *SDWAInst = OrSDWADef->getParent();
    MachineInstr *OtherInst = OrOtherDef->getParent();

    // Check that OtherInst is actually bitwise compatible with SDWAInst, i.e.
    // that their destination bit patterns don't overlap. A compatible
    // instruction can be either a regular instruction with compatible bitness
    // or an SDWA instruction with a correct dst_sel.
    // SDWAInst | OtherInst bitness     / OtherInst dst_sel
    // -----------------------------------------------------
    // DWORD    | no                    / no
    // WORD_0   | no                    / BYTE_2/3, WORD_1
    // WORD_1   | 8/16-bit instructions / BYTE_0/1, WORD_0
    // BYTE_0   | no                    / BYTE_1/2/3, WORD_1
    // BYTE_1   | 8-bit                 / BYTE_0/2/3, WORD_1
    // BYTE_2   | 8/16-bit              / BYTE_0/1/3, WORD_0
    // BYTE_3   | 8/16/24-bit           / BYTE_0/1/2, WORD_0
    // E.g. if SDWAInst is v_add_f16_sdwa dst_sel:WORD_1 then v_add_f16 is OK
    // but v_add_f32 is not.

    // TODO: add support for non-SDWA instructions as OtherInst.
    // For now this only works with SDWA instructions. For regular instructions
    // there is no way to determine if the instruction writes only 8/16/24 bits
    // out of the full register size, and all registers are at least 32 bits
    // wide.
    if (!TII->isSDWA(*OtherInst))
      break;

    SdwaSel DstSel = static_cast<SdwaSel>(
        TII->getNamedImmOperand(*SDWAInst, AMDGPU::OpName::dst_sel));
    SdwaSel OtherDstSel = static_cast<SdwaSel>(
        TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_sel));

    bool DstSelAgree = false;
    switch (DstSel) {
    case WORD_0: DstSelAgree = ((OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case WORD_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == WORD_0));
      break;
    case BYTE_0: DstSelAgree = ((OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case BYTE_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case BYTE_2: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_0));
      break;
    case BYTE_3: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == WORD_0));
      break;
    default: DstSelAgree = false;
    }

    if (!DstSelAgree)
      break;

    // Also OtherInst dst_unused should be UNUSED_PAD.
    DstUnused OtherDstUnused = static_cast<DstUnused>(
        TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_unused));
    if (OtherDstUnused != DstUnused::UNUSED_PAD)
      break;

    // Create the DstPreserveOperand.
    MachineOperand *OrDst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
    assert(OrDst && OrDst->isReg());

    return std::make_unique<SDWADstPreserveOperand>(
        OrDst, OrSDWADef, OrOtherDef, DstSel);
  }
  }

  return std::unique_ptr<SDWAOperand>(nullptr);
}

#if !defined(NDEBUG)
static raw_ostream& operator<<(raw_ostream &OS, const SDWAOperand &Operand) {
  Operand.print(OS);
  return OS;
}
#endif

void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
  for (MachineInstr &MI : MBB) {
    if (auto Operand = matchSDWAOperand(MI)) {
      LLVM_DEBUG(dbgs() << "Match: " << MI << "To: " << *Operand << '\n');
      SDWAOperands[&MI] = std::move(Operand);
      ++NumSDWAPatternsFound;
    }
  }
}

// Convert the V_ADD_CO_U32_e64 into V_ADD_CO_U32_e32. This allows
// isConvertibleToSDWA to perform its transformation on V_ADD_CO_U32_e32 into
// V_ADD_CO_U32_sdwa.
//
// We are transforming from a VOP3 into a VOP2 form of the instruction.
//   %19:vgpr_32 = V_AND_B32_e32 255,
//       killed %16:vgpr_32, implicit $exec
//   %47:vgpr_32, %49:sreg_64_xexec = V_ADD_CO_U32_e64
//       %26.sub0:vreg_64, %19:vgpr_32, implicit $exec
//   %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64
//       %26.sub1:vreg_64, %54:vgpr_32, killed %49:sreg_64_xexec, implicit $exec
//
// becomes
//   %47:vgpr_32 = V_ADD_CO_U32_sdwa
//       0, %26.sub0:vreg_64, 0, killed %16:vgpr_32, 0, 6, 0, 6, 0,
//       implicit-def $vcc, implicit $exec
//   %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64
//       %26.sub1:vreg_64, %54:vgpr_32, killed $vcc, implicit $exec
void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
                                           const GCNSubtarget &ST) const {
  int Opc = MI.getOpcode();
  assert((Opc == AMDGPU::V_ADD_CO_U32_e64 || Opc == AMDGPU::V_SUB_CO_U32_e64) &&
         "Currently only handles V_ADD_CO_U32_e64 or V_SUB_CO_U32_e64");

  // Can the candidate MI be shrunk?
  if (!TII->canShrink(MI, *MRI))
    return;
  Opc = AMDGPU::getVOPe32(Opc);
  // Find the related ADD instruction.
  const MachineOperand *Sdst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
  if (!Sdst)
    return;
  MachineOperand *NextOp = findSingleRegUse(Sdst, MRI);
  if (!NextOp)
    return;
  MachineInstr &MISucc = *NextOp->getParent();

  // Make sure the carry in/out are subsequently unused.
  MachineOperand *CarryIn = TII->getNamedOperand(MISucc, AMDGPU::OpName::src2);
  if (!CarryIn)
    return;
  MachineOperand *CarryOut = TII->getNamedOperand(MISucc, AMDGPU::OpName::sdst);
  if (!CarryOut)
    return;
  if (!MRI->hasOneUse(CarryIn->getReg()) || !MRI->use_empty(CarryOut->getReg()))
    return;
  // Make sure VCC or its subregs are dead before MI.
  MachineBasicBlock &MBB = *MI.getParent();
  auto Liveness = MBB.computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 25);
  if (Liveness != MachineBasicBlock::LQR_Dead)
    return;
  // Check if VCC is referenced in the range (MI,MISucc].
  for (auto I = std::next(MI.getIterator()), E = MISucc.getIterator();
       I != E; ++I) {
    if (I->modifiesRegister(AMDGPU::VCC, TRI))
      return;
  }

  // Replace MI with V_{SUB|ADD}_I32_e32.
  BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opc))
      .add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst))
      .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0))
      .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1))
      .setMIFlags(MI.getFlags());

  MI.eraseFromParent();

  // Since the carry output of MI is now VCC, update its use in MISucc.
  MISucc.substituteRegister(CarryIn->getReg(), TRI->getVCC(), 0, *TRI);
}

namespace {
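// Check whether MI can be rewritten as an SDWA instruction: either it already
// is one, or an _sdwa variant of its opcode (possibly via the VOP2 form)
// exists and the subtarget's restrictions (omod, clamp and sdst on VOPC,
// mac/fmac, scalar sources) are satisfied.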
bool isConvertibleToSDWA(MachineInstr &MI,
                         const GCNSubtarget &ST,
                         const SIInstrInfo* TII) {
  // Check if this is already an SDWA instruction.
  unsigned Opc = MI.getOpcode();
  if (TII->isSDWA(Opc))
    return true;

  // Check if this instruction has an opcode that supports SDWA.
  if (AMDGPU::getSDWAOp(Opc) == -1)
    Opc = AMDGPU::getVOPe32(Opc);

  if (AMDGPU::getSDWAOp(Opc) == -1)
    return false;

  if (!ST.hasSDWAOmod() && TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
    return false;

  if (TII->isVOPC(Opc)) {
    if (!ST.hasSDWASdst()) {
      const MachineOperand *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
      if (SDst && (SDst->getReg() != AMDGPU::VCC &&
                   SDst->getReg() != AMDGPU::VCC_LO))
        return false;
    }

    if (!ST.hasSDWAOutModsVOPC() &&
        (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
         TII->hasModifiersSet(MI, AMDGPU::OpName::omod)))
      return false;

  } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst) ||
             !TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
    return false;
  }

  if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_FMAC_F16_e32 ||
                           Opc == AMDGPU::V_FMAC_F32_e32 ||
                           Opc == AMDGPU::V_MAC_F16_e32 ||
                           Opc == AMDGPU::V_MAC_F32_e32))
    return false;

  // Check if the target supports this SDWA opcode.
  if (TII->pseudoToMCOpcode(Opc) == -1)
    return false;

  // FIXME: has SDWA but requires handling of the implicit VCC use.
  if (Opc == AMDGPU::V_CNDMASK_B32_e32)
    return false;

  if (MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0)) {
    if (!Src0->isReg() && !Src0->isImm())
      return false;
  }

  if (MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1)) {
    if (!Src1->isReg() && !Src1->isImm())
      return false;
  }

  return true;
}
} // namespace
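
// Build the SDWA form of MI: create a new _sdwa instruction, copy over the
// operands that exist on MI and default-initialize the ones that don't, then
// let every matched SDWAOperand patch in its sel/modifier fields. On failure
// the new instruction is erased and MI is left untouched.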
bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
                                   const SDWAOperandsVector &SDWAOperands) {

  LLVM_DEBUG(dbgs() << "Convert instruction:" << MI);

  // Convert to SDWA.
  int SDWAOpcode;
  unsigned Opcode = MI.getOpcode();
  if (TII->isSDWA(Opcode)) {
    SDWAOpcode = Opcode;
  } else {
    SDWAOpcode = AMDGPU::getSDWAOp(Opcode);
    if (SDWAOpcode == -1)
      SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(Opcode));
  }
  assert(SDWAOpcode != -1);

  const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);

  // Create the SDWA version of instruction MI and initialize its operands.
  MachineInstrBuilder SDWAInst =
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc)
          .setMIFlags(MI.getFlags());

  // Copy dst; if it is present in the original then it should also be present
  // in the SDWA form.
  MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  if (Dst) {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::vdst));
    SDWAInst.add(*Dst);
  } else if ((Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst))) {
    assert(Dst && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::sdst));
    SDWAInst.add(*Dst);
  } else {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::sdst));
    SDWAInst.addReg(TRI->getVCC(), RegState::Define);
  }

  // Copy src0 and initialize src0_modifiers. All SDWA instructions have src0
  // and src0_modifiers (except for v_nop_sdwa, but it can't get here).
  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  assert(Src0 && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0) &&
         AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0_modifiers));
  if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers))
    SDWAInst.addImm(Mod->getImm());
  else
    SDWAInst.addImm(0);
  SDWAInst.add(*Src0);

  // Copy src1 if present, initialize src1_modifiers.
  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  if (Src1) {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1) &&
           AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1_modifiers));
    if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers))
      SDWAInst.addImm(Mod->getImm());
    else
      SDWAInst.addImm(0);
    SDWAInst.add(*Src1);
  }

  if (SDWAOpcode == AMDGPU::V_FMAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_FMAC_F32_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) {
    // v_mac_f16/32 has an additional src2 operand tied to vdst.
    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    assert(Src2);
    SDWAInst.add(*Src2);
  }

  // Copy clamp if present, initialize otherwise.
  assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::clamp));
  MachineOperand *Clamp = TII->getNamedOperand(MI, AMDGPU::OpName::clamp);
  if (Clamp) {
    SDWAInst.add(*Clamp);
  } else {
    SDWAInst.addImm(0);
  }

  // Copy omod if present, initialize otherwise if needed.
  if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::omod)) {
    MachineOperand *OMod = TII->getNamedOperand(MI, AMDGPU::OpName::omod);
    if (OMod) {
      SDWAInst.add(*OMod);
    } else {
      SDWAInst.addImm(0);
    }
  }

  // Copy dst_sel if present, initialize otherwise if needed.
  if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_sel)) {
    MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
    if (DstSel) {
      SDWAInst.add(*DstSel);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
    }
  }

  // Copy dst_unused if present, initialize otherwise if needed.
  if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_unused)) {
    MachineOperand *DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
    if (DstUnused) {
      SDWAInst.add(*DstUnused);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD);
    }
  }

  // Copy src0_sel if present, initialize otherwise.
  assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0_sel));
  MachineOperand *Src0Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
  if (Src0Sel) {
    SDWAInst.add(*Src0Sel);
  } else {
    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
  }

  // Copy src1_sel if present, initialize otherwise if needed.
  if (Src1) {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1_sel));
    MachineOperand *Src1Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
    if (Src1Sel) {
      SDWAInst.add(*Src1Sel);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
    }
  }

  // Check for a preserved register that needs to be copied.
  auto DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
  if (DstUnused &&
      DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
    // We expect, if we are here, that the instruction was already in its SDWA
    // form, with a tied operand.
    assert(Dst && Dst->isTied());
    assert(Opcode == static_cast<unsigned int>(SDWAOpcode));
    // We also expect a vdst, since sdst can't preserve.
    auto PreserveDstIdx = AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst);
    assert(PreserveDstIdx != -1);

    auto TiedIdx = MI.findTiedOperandIdx(PreserveDstIdx);
    auto Tied = MI.getOperand(TiedIdx);

    SDWAInst.add(Tied);
    SDWAInst->tieOperands(PreserveDstIdx, SDWAInst->getNumOperands() - 1);
  }

  // Apply all SDWA operand patterns.
  bool Converted = false;
  for (auto &Operand : SDWAOperands) {
    LLVM_DEBUG(dbgs() << *SDWAInst << "\nOperand: " << *Operand);
    // There should be no intersection between SDWA operands and potential MIs,
    // e.g.:
    // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0
    // v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0
    // v_add_u32 v3, v4, v2
    //
    // In that example it is possible that we would fold the 2nd instruction
    // into the 3rd (v_add_u32_sdwa) and then try to fold the 1st instruction
    // into the 2nd (which was already destroyed). So if an SDWAOperand is also
    // a potential MI then do not apply it.
    if (PotentialMatches.count(Operand->getParentInst()) == 0)
      Converted |= Operand->convertToSDWA(*SDWAInst, TII);
  }

  if (Converted) {
    ConvertedInstructions.push_back(SDWAInst);
    for (MachineOperand &MO : SDWAInst->uses()) {
      if (!MO.isReg())
        continue;

      MRI->clearKillFlags(MO.getReg());
    }
  } else {
    SDWAInst->eraseFromParent();
    return false;
  }

  LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n');
  ++NumSDWAInstructionsPeepholed;

  MI.eraseFromParent();
  return true;
}

// If an instruction was converted to SDWA it should not have immediates or
// SGPR operands (one SGPR is allowed on GFX9). Copy its scalar operands into
// VGPRs.
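//
// A sketch of this legalization (virtual register numbers are illustrative),
// assuming a subtarget where SGPR sources are not allowed in SDWA:
//   %2 = V_ADD_F16_sdwa ..., %3:sgpr_32, ...
// becomes
//   %4:vgpr_32 = V_MOV_B32_e32 %3:sgpr_32
//   %2 = V_ADD_F16_sdwa ..., %4, ...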
void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI,
                                            const GCNSubtarget &ST) const {
  const MCInstrDesc &Desc = TII->get(MI.getOpcode());
  unsigned ConstantBusCount = 0;
  for (MachineOperand &Op : MI.explicit_uses()) {
    if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg())))
      continue;

    unsigned I = Op.getOperandNo();
    if (Desc.operands()[I].RegClass == -1 ||
        !TRI->isVSSuperClass(TRI->getRegClass(Desc.operands()[I].RegClass)))
      continue;

    if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() &&
        TRI->isSGPRReg(*MRI, Op.getReg())) {
      ++ConstantBusCount;
      continue;
    }

    Register VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
                        TII->get(AMDGPU::V_MOV_B32_e32), VGPR);
    if (Op.isImm())
      Copy.addImm(Op.getImm());
    else if (Op.isReg())
      Copy.addReg(Op.getReg(), Op.isKill() ? RegState::Kill : 0,
                  Op.getSubReg());
    Op.ChangeToRegister(VGPR, false);
  }
}

bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  if (!ST.hasSDWA() || skipFunction(MF.getFunction()))
    return false;

  MRI = &MF.getRegInfo();
  TRI = ST.getRegisterInfo();
  TII = ST.getInstrInfo();

  // Find all SDWA operands in MF.
  bool Ret = false;
  for (MachineBasicBlock &MBB : MF) {
    bool Changed = false;
    do {
      // Preprocess the ADD/SUB pairs so they can be SDWA'ed.
      // Look for a possible ADD or SUB that resulted from a previously lowered
      // V_{ADD|SUB}_U64_PSEUDO. The function pseudoOpConvertToVOP2
      // lowers the pair of instructions into e32 form.
      matchSDWAOperands(MBB);
      for (const auto &OperandPair : SDWAOperands) {
        const auto &Operand = OperandPair.second;
        MachineInstr *PotentialMI = Operand->potentialToConvert(TII, ST);
        if (PotentialMI &&
            (PotentialMI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 ||
             PotentialMI->getOpcode() == AMDGPU::V_SUB_CO_U32_e64))
          pseudoOpConvertToVOP2(*PotentialMI, ST);
      }
      SDWAOperands.clear();

      // Generate the potential match list.
      matchSDWAOperands(MBB);

      for (const auto &OperandPair : SDWAOperands) {
        const auto &Operand = OperandPair.second;
        MachineInstr *PotentialMI =
            Operand->potentialToConvert(TII, ST, &PotentialMatches);
        if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST, TII))
          PotentialMatches[PotentialMI].push_back(Operand.get());
      }

      for (auto &PotentialPair : PotentialMatches) {
        MachineInstr &PotentialMI = *PotentialPair.first;
        convertToSDWA(PotentialMI, PotentialPair.second);
      }

      PotentialMatches.clear();
      SDWAOperands.clear();

      Changed = !ConvertedInstructions.empty();

      if (Changed)
        Ret = true;
      while (!ConvertedInstructions.empty())
        legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
    } while (Changed);
  }

  return Ret;
}