//===- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass tries to apply several peephole SDWA patterns.
///
/// E.g. original:
///   V_LSHRREV_B32_e32 %0, 16, %1
///   V_ADD_I32_e32 %2, %0, %3
///   V_LSHLREV_B32_e32 %4, 16, %2
///
/// Replace:
///   V_ADD_I32_sdwa %4, %1, %3
///       dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/MC/LaneBitmask.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <memory>
#include <unordered_map>

using namespace llvm;

#define DEBUG_TYPE "si-peephole-sdwa"

STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found.");
STATISTIC(NumSDWAInstructionsPeepholed,
          "Number of instructions converted to SDWA.");

namespace {

class SDWAOperand;
class SDWADstOperand;

class SIPeepholeSDWA : public MachineFunctionPass {
public:
  using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>;

private:
  MachineRegisterInfo *MRI;
  const SIRegisterInfo *TRI;
  const SIInstrInfo *TII;

  std::unordered_map<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
  std::unordered_map<MachineInstr *, SDWAOperandsVector> PotentialMatches;
  SmallVector<MachineInstr *, 8> ConvertedInstructions;

  Optional<int64_t> foldToImm(const MachineOperand &Op) const;

public:
  static char ID;

  SIPeepholeSDWA() : MachineFunctionPass(ID) {
    initializeSIPeepholeSDWAPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
  void matchSDWAOperands(MachineBasicBlock &MBB);
  std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
  bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST) const;
  void pseudoOpConvertToVOP2(MachineInstr &MI,
                             const GCNSubtarget &ST) const;
  bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
  void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const;

  StringRef getPassName() const override { return "SI Peephole SDWA"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

class SDWAOperand {
private:
  MachineOperand *Target; // Operand that would be used in converted instruction
  MachineOperand *Replaced; // Operand that would be replaced by Target

public:
  SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp)
      : Target(TargetOp), Replaced(ReplacedOp) {
    assert(Target->isReg());
    assert(Replaced->isReg());
  }

  virtual ~SDWAOperand() = default;

  virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) = 0;
  virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0;

  MachineOperand *getTargetOperand() const { return Target; }
  MachineOperand *getReplacedOperand() const { return Replaced; }
  MachineInstr *getParentInst() const { return Target->getParent(); }

  MachineRegisterInfo *getMRI() const {
    return &getParentInst()->getParent()->getParent()->getRegInfo();
  }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  virtual void print(raw_ostream& OS) const = 0;
  void dump() const { print(dbgs()); }
#endif
};

using namespace AMDGPU::SDWA;

class SDWASrcOperand : public SDWAOperand {
private:
  SdwaSel SrcSel;
  bool Abs;
  bool Neg;
  bool Sext;

public:
  SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false,
                 bool Sext_ = false)
      : SDWAOperand(TargetOp, ReplacedOp),
        SrcSel(SrcSel_), Abs(Abs_), Neg(Neg_), Sext(Sext_) {}

  MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

  SdwaSel getSrcSel() const { return SrcSel; }
  bool getAbs() const { return Abs; }
  bool getNeg() const { return Neg; }
  bool getSext() const { return Sext; }

  uint64_t getSrcMods(const SIInstrInfo *TII,
                      const MachineOperand *SrcOp) const;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};

class SDWADstOperand : public SDWAOperand {
private:
  SdwaSel DstSel;
  DstUnused DstUn;

public:
  SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD)
      : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}

  MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

  SdwaSel getDstSel() const { return DstSel; }
  DstUnused getDstUnused() const { return DstUn; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};

class SDWADstPreserveOperand : public SDWADstOperand {
private:
  MachineOperand *Preserve;

public:
  SDWADstPreserveOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                         MachineOperand *PreserveOp, SdwaSel DstSel_ = DWORD)
      : SDWADstOperand(TargetOp, ReplacedOp, DstSel_, UNUSED_PRESERVE),
        Preserve(PreserveOp) {}

  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

  MachineOperand *getPreservedOperand() const { return Preserve; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};

} // end anonymous namespace

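// A short walk-through of the header example, for orientation (illustrative
// only, mirroring the logic in matchSDWAOperand below):
//   V_LSHRREV_B32_e32 %0, 16, %1
// yields an SDWASrcOperand {Target:%1, Replaced:%0, SrcSel:WORD_1}, and
//   V_LSHLREV_B32_e32 %4, 16, %2
// yields an SDWADstOperand {Target:%4, Replaced:%2, DstSel:WORD_1,
// DstUn:UNUSED_PAD}. Both fold into the V_ADD_I32 between them when it is
// converted to V_ADD_I32_sdwa: the v_lshlrev is erased as part of the dst
// conversion, while the v_lshrrev simply loses its only user.
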
INITIALIZE_PASS(SIPeepholeSDWA, DEBUG_TYPE, "SI Peephole SDWA", false, false)

char SIPeepholeSDWA::ID = 0;

char &llvm::SIPeepholeSDWAID = SIPeepholeSDWA::ID;

FunctionPass *llvm::createSIPeepholeSDWAPass() {
  return new SIPeepholeSDWA();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
static raw_ostream& operator<<(raw_ostream &OS, SdwaSel Sel) {
  switch(Sel) {
  case BYTE_0: OS << "BYTE_0"; break;
  case BYTE_1: OS << "BYTE_1"; break;
  case BYTE_2: OS << "BYTE_2"; break;
  case BYTE_3: OS << "BYTE_3"; break;
  case WORD_0: OS << "WORD_0"; break;
  case WORD_1: OS << "WORD_1"; break;
  case DWORD:  OS << "DWORD"; break;
  }
  return OS;
}

static raw_ostream& operator<<(raw_ostream &OS, const DstUnused &Un) {
  switch(Un) {
  case UNUSED_PAD: OS << "UNUSED_PAD"; break;
  case UNUSED_SEXT: OS << "UNUSED_SEXT"; break;
  case UNUSED_PRESERVE: OS << "UNUSED_PRESERVE"; break;
  }
  return OS;
}

static raw_ostream& operator<<(raw_ostream &OS, const SDWAOperand &Operand) {
  Operand.print(OS);
  return OS;
}

LLVM_DUMP_METHOD
void SDWASrcOperand::print(raw_ostream& OS) const {
  OS << "SDWA src: " << *getTargetOperand()
     << " src_sel:" << getSrcSel()
     << " abs:" << getAbs() << " neg:" << getNeg()
     << " sext:" << getSext() << '\n';
}

LLVM_DUMP_METHOD
void SDWADstOperand::print(raw_ostream& OS) const {
  OS << "SDWA dst: " << *getTargetOperand()
     << " dst_sel:" << getDstSel()
     << " dst_unused:" << getDstUnused() << '\n';
}

LLVM_DUMP_METHOD
void SDWADstPreserveOperand::print(raw_ostream& OS) const {
  OS << "SDWA preserve dst: " << *getTargetOperand()
     << " dst_sel:" << getDstSel()
     << " preserve:" << *getPreservedOperand() << '\n';
}

#endif

static void copyRegOperand(MachineOperand &To, const MachineOperand &From) {
  assert(To.isReg() && From.isReg());
  To.setReg(From.getReg());
  To.setSubReg(From.getSubReg());
  To.setIsUndef(From.isUndef());
  if (To.isUse()) {
    To.setIsKill(From.isKill());
  } else {
    To.setIsDead(From.isDead());
  }
}

static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) {
  return LHS.isReg() &&
         RHS.isReg() &&
         LHS.getReg() == RHS.getReg() &&
         LHS.getSubReg() == RHS.getSubReg();
}

static MachineOperand *findSingleRegUse(const MachineOperand *Reg,
                                        const MachineRegisterInfo *MRI) {
  if (!Reg->isReg() || !Reg->isDef())
    return nullptr;

  MachineOperand *ResMO = nullptr;
  for (MachineOperand &UseMO : MRI->use_nodbg_operands(Reg->getReg())) {
    // If there exists a use of a subreg of Reg then return nullptr
    if (!isSameReg(UseMO, *Reg))
      return nullptr;

    // Check that there is only one instruction that uses Reg
    if (!ResMO) {
      ResMO = &UseMO;
    } else if (ResMO->getParent() != UseMO.getParent()) {
      return nullptr;
    }
  }

  return ResMO;
}

static MachineOperand *findSingleRegDef(const MachineOperand *Reg,
                                        const MachineRegisterInfo *MRI) {
  if (!Reg->isReg())
    return nullptr;

  MachineInstr *DefInstr = MRI->getUniqueVRegDef(Reg->getReg());
  if (!DefInstr)
    return nullptr;

  for (auto &DefMO : DefInstr->defs()) {
    if (DefMO.isReg() && DefMO.getReg() == Reg->getReg())
      return &DefMO;
  }

  // Ignore implicit defs.
  return nullptr;
}

uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII,
                                    const MachineOperand *SrcOp) const {
  uint64_t Mods = 0;
  const auto *MI = SrcOp->getParent();
  if (TII->getNamedOperand(*MI, AMDGPU::OpName::src0) == SrcOp) {
    if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) {
      Mods = Mod->getImm();
    }
  } else if (TII->getNamedOperand(*MI, AMDGPU::OpName::src1) == SrcOp) {
    if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers)) {
      Mods = Mod->getImm();
    }
  }
  if (Abs || Neg) {
    assert(!Sext &&
           "Float and integer src modifiers can't be set simultaneously");
    Mods |= Abs ? SISrcMods::ABS : 0u;
    Mods ^= Neg ? SISrcMods::NEG : 0u;
  } else if (Sext) {
    Mods |= SISrcMods::SEXT;
  }

  return Mods;
}

MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII) {
  // For an SDWA src operand the potential instruction is the one that uses
  // the register defined by the parent instruction
  MachineOperand *PotentialMO = findSingleRegUse(getReplacedOperand(), getMRI());
  if (!PotentialMO)
    return nullptr;

  return PotentialMO->getParent();
}

bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  // Find the operand in the instruction that matches the replaced operand and
  // replace it with the target operand. Set the corresponding src_sel.
  bool IsPreserveSrc = false;
  MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
  MachineOperand *SrcMods =
      TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
  assert(Src && (Src->isReg() || Src->isImm()));
  if (!isSameReg(*Src, *getReplacedOperand())) {
    // If this is not src0 then it could be src1
    Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
    SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);

    if (!Src ||
        !isSameReg(*Src, *getReplacedOperand())) {
      // It's possible this Src is a tied operand for UNUSED_PRESERVE, in
      // which case we can either abandon the peephole attempt or, if legal,
      // copy the target operand into the tied slot when the preserve
      // operation will effectively produce the same result by overwriting
      // the rest of the dst.
      MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
      MachineOperand *DstUnused =
          TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);

      if (Dst &&
          DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
        // This will work if the tied src is accessing WORD_0, and the dst is
        // writing WORD_1. Modifiers don't matter because all the bits that
        // would be impacted are being overwritten by the dst.
        // Any other case will not work.
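        //
        // Illustrative sketch (hypothetical MIR, not taken from a test):
        //   %dst = V_ADD_F16_sdwa ..., %tied
        //       dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE
        // Folding a WORD_0 source into the tied slot is safe here because the
        // add itself rewrites WORD_1, so only WORD_0 of the tied register is
        // observable in the result.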
        SdwaSel DstSel = static_cast<SdwaSel>(
            TII->getNamedImmOperand(MI, AMDGPU::OpName::dst_sel));
        if (DstSel == AMDGPU::SDWA::SdwaSel::WORD_1 &&
            getSrcSel() == AMDGPU::SDWA::SdwaSel::WORD_0) {
          IsPreserveSrc = true;
          auto DstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                                   AMDGPU::OpName::vdst);
          auto TiedIdx = MI.findTiedOperandIdx(DstIdx);
          Src = &MI.getOperand(TiedIdx);
          SrcSel = nullptr;
          SrcMods = nullptr;
        } else {
          // Not legal to convert this src
          return false;
        }
      }
    }
    assert(Src && Src->isReg());

    if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
         MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
         MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
         MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
        !isSameReg(*Src, *getReplacedOperand())) {
      // In case of v_mac_f16/32_sdwa this pass can try to apply the src
      // operand to src2. This is not allowed.
      return false;
    }

    assert(isSameReg(*Src, *getReplacedOperand()) &&
           (IsPreserveSrc || (SrcSel && SrcMods)));
  }
  copyRegOperand(*Src, *getTargetOperand());
  if (!IsPreserveSrc) {
    SrcSel->setImm(getSrcSel());
    SrcMods->setImm(getSrcMods(TII, Src));
  }
  getTargetOperand()->setIsKill(false);
  return true;
}

MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII) {
  // For an SDWA dst operand the potential instruction is the one that defines
  // the register that this operand uses
  MachineRegisterInfo *MRI = getMRI();
  MachineInstr *ParentMI = getParentInst();

  MachineOperand *PotentialMO = findSingleRegDef(getReplacedOperand(), MRI);
  if (!PotentialMO)
    return nullptr;

  // Check that ParentMI is the only instruction that uses the replaced register
  for (MachineInstr &UseInst : MRI->use_nodbg_instructions(PotentialMO->getReg())) {
    if (&UseInst != ParentMI)
      return nullptr;
  }

  return PotentialMO->getParent();
}

bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  // Replace the vdst operand in MI with the target operand. Set dst_sel and
  // dst_unused.

  if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
      getDstSel() != AMDGPU::SDWA::DWORD) {
    // v_mac_f16/32_sdwa allows dst_sel to be equal only to DWORD
    return false;
  }

  MachineOperand *Operand = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  assert(Operand &&
         Operand->isReg() &&
         isSameReg(*Operand, *getReplacedOperand()));
  copyRegOperand(*Operand, *getTargetOperand());
  MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
  assert(DstSel);
  DstSel->setImm(getDstSel());
  MachineOperand *DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
  assert(DstUnused);
  DstUnused->setImm(getDstUnused());

  // Remove the original instruction because it would conflict with our new
  // instruction by register definition
  getParentInst()->eraseFromParent();
  return true;
}

bool SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI,
                                           const SIInstrInfo *TII) {
  // MI should be moved right before v_or_b32.
  // For this we should clear all kill flags on uses of MI src-operands or else
  // we can encounter a problem with the use of a killed operand.
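  //
  // Rough before/after picture (illustrative MIR, register names hypothetical):
  //   %3 = V_ADD_F16_sdwa ... dst_sel:WORD_0 dst_unused:UNUSED_PAD ; preserved
  //   %0 = V_ADD_F16_sdwa ... dst_sel:WORD_1                       ; MI
  //   %4 = V_OR_B32_e32 %0, %3            ; getParentInst(), erased at the end
  // After conversion MI writes %4 directly with dst_unused:UNUSED_PRESERVE and
  // an implicit use of %3 tied to vdst.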
  for (MachineOperand &MO : MI.uses()) {
    if (!MO.isReg())
      continue;
    getMRI()->clearKillFlags(MO.getReg());
  }

  // Move MI before v_or_b32
  auto MBB = MI.getParent();
  MBB->remove(&MI);
  MBB->insert(getParentInst(), &MI);

  // Add implicit use of preserved register
  MachineInstrBuilder MIB(*MBB->getParent(), MI);
  MIB.addReg(getPreservedOperand()->getReg(),
             RegState::ImplicitKill,
             getPreservedOperand()->getSubReg());

  // Tie dst to implicit use
  MI.tieOperands(AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst),
                 MI.getNumOperands() - 1);

  // Convert MI as any other SDWADstOperand and remove v_or_b32
  return SDWADstOperand::convertToSDWA(MI, TII);
}

Optional<int64_t> SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const {
  if (Op.isImm()) {
    return Op.getImm();
  }

  // If this is not an immediate then it can be a copy of an immediate value,
  // e.g.:
  // %1 = S_MOV_B32 255;
  if (Op.isReg()) {
    for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) {
      if (!isSameReg(Op, Def))
        continue;

      const MachineInstr *DefInst = Def.getParent();
      if (!TII->isFoldableCopy(*DefInst))
        return None;

      const MachineOperand &Copied = DefInst->getOperand(1);
      if (!Copied.isImm())
        return None;

      return Copied.getImm();
    }
  }

  return None;
}

std::unique_ptr<SDWAOperand>
SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  switch (Opcode) {
  case AMDGPU::V_LSHRREV_B32_e32:
  case AMDGPU::V_ASHRREV_I32_e32:
  case AMDGPU::V_LSHLREV_B32_e32:
  case AMDGPU::V_LSHRREV_B32_e64:
  case AMDGPU::V_ASHRREV_I32_e64:
  case AMDGPU::V_LSHLREV_B32_e64: {
    // from: v_lshrrev_b32_e32 v1, 16/24, v0
    // to SDWA src:v0 src_sel:WORD_1/BYTE_3

    // from: v_ashrrev_i32_e32 v1, 16/24, v0
    // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1

    // from: v_lshlrev_b32_e32 v1, 16/24, v0
    // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD
    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    auto Imm = foldToImm(*Src0);
    if (!Imm)
      break;

    if (*Imm != 16 && *Imm != 24)
      break;

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
    if (TRI->isPhysicalRegister(Src1->getReg()) ||
        TRI->isPhysicalRegister(Dst->getReg()))
      break;

    if (Opcode == AMDGPU::V_LSHLREV_B32_e32 ||
        Opcode == AMDGPU::V_LSHLREV_B32_e64) {
      return make_unique<SDWADstOperand>(
          Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
    } else {
      return make_unique<SDWASrcOperand>(
          Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false,
          Opcode != AMDGPU::V_LSHRREV_B32_e32 &&
          Opcode != AMDGPU::V_LSHRREV_B32_e64);
    }
    break;
  }

  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_LSHLREV_B16_e64: {
    // from: v_lshrrev_b16_e32 v1, 8, v0
    // to SDWA src:v0 src_sel:BYTE_1

    // from: v_ashrrev_i16_e32 v1, 8, v0
    // to SDWA src:v0 src_sel:BYTE_1 sext:1

    // from: v_lshlrev_b16_e32 v1, 8, v0
    // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD
    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    auto Imm = foldToImm(*Src0);
    if (!Imm || *Imm != 8)
      break;

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (TRI->isPhysicalRegister(Src1->getReg()) ||
        TRI->isPhysicalRegister(Dst->getReg()))
      break;

    if (Opcode == AMDGPU::V_LSHLREV_B16_e32 ||
        Opcode == AMDGPU::V_LSHLREV_B16_e64) {
      return make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
    } else {
      return make_unique<SDWASrcOperand>(
          Src1, Dst, BYTE_1, false, false,
          Opcode != AMDGPU::V_LSHRREV_B16_e32 &&
          Opcode != AMDGPU::V_LSHRREV_B16_e64);
    }
    break;
  }

  case AMDGPU::V_BFE_I32:
  case AMDGPU::V_BFE_U32: {
    // e.g.:
    // from: v_bfe_u32 v1, v0, 8, 8
    // to SDWA src:v0 src_sel:BYTE_1

    // offset | width | src_sel
    // ------------------------
    // 0      | 8     | BYTE_0
    // 0      | 16    | WORD_0
    // 0      | 32    | DWORD ?
    // 8      | 8     | BYTE_1
    // 16     | 8     | BYTE_2
    // 16     | 16    | WORD_1
    // 24     | 8     | BYTE_3

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    auto Offset = foldToImm(*Src1);
    if (!Offset)
      break;

    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    auto Width = foldToImm(*Src2);
    if (!Width)
      break;

    SdwaSel SrcSel = DWORD;

    if (*Offset == 0 && *Width == 8)
      SrcSel = BYTE_0;
    else if (*Offset == 0 && *Width == 16)
      SrcSel = WORD_0;
    else if (*Offset == 0 && *Width == 32)
      SrcSel = DWORD;
    else if (*Offset == 8 && *Width == 8)
      SrcSel = BYTE_1;
    else if (*Offset == 16 && *Width == 8)
      SrcSel = BYTE_2;
    else if (*Offset == 16 && *Width == 16)
      SrcSel = WORD_1;
    else if (*Offset == 24 && *Width == 8)
      SrcSel = BYTE_3;
    else
      break;

    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (TRI->isPhysicalRegister(Src0->getReg()) ||
        TRI->isPhysicalRegister(Dst->getReg()))
      break;

    return make_unique<SDWASrcOperand>(
        Src0, Dst, SrcSel, false, false, Opcode != AMDGPU::V_BFE_U32);
  }

  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::V_AND_B32_e64: {
    // e.g.:
    // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0
    // to SDWA src:v0 src_sel:WORD_0/BYTE_0

    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    auto ValSrc = Src1;
    auto Imm = foldToImm(*Src0);

    if (!Imm) {
      Imm = foldToImm(*Src1);
      ValSrc = Src0;
    }

    if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff))
      break;

    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (TRI->isPhysicalRegister(ValSrc->getReg()) ||
        TRI->isPhysicalRegister(Dst->getReg()))
      break;

    return make_unique<SDWASrcOperand>(
        ValSrc, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
  }

  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::V_OR_B32_e64: {
    // Patterns for dst_unused:UNUSED_PRESERVE.
    // e.g., from:
    // v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD
    //                           src1_sel:WORD_1 src2_sel:WORD_1
    // v_add_f16_e32 v3, v1, v2
    // v_or_b32_e32 v4, v0, v3
    // to SDWA preserve dst:v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE preserve:v3

    // Check if one of the operands of v_or_b32 is an SDWA instruction
    using CheckRetType = Optional<std::pair<MachineOperand *, MachineOperand *>>;
    auto CheckOROperandsForSDWA =
        [&](const MachineOperand *Op1, const MachineOperand *Op2) -> CheckRetType {
      if (!Op1 || !Op1->isReg() || !Op2 || !Op2->isReg())
        return CheckRetType(None);

      MachineOperand *Op1Def = findSingleRegDef(Op1, MRI);
      if (!Op1Def)
        return CheckRetType(None);

      MachineInstr *Op1Inst = Op1Def->getParent();
      if (!TII->isSDWA(*Op1Inst))
        return CheckRetType(None);

      MachineOperand *Op2Def = findSingleRegDef(Op2, MRI);
      if (!Op2Def)
        return CheckRetType(None);

      return CheckRetType(std::make_pair(Op1Def, Op2Def));
    };

    MachineOperand *OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    assert(OrSDWA && OrOther);
    auto Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
    if (!Res) {
      OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
      OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
      assert(OrSDWA && OrOther);
      Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
      if (!Res)
        break;
    }

    MachineOperand *OrSDWADef = Res->first;
    MachineOperand *OrOtherDef = Res->second;
    assert(OrSDWADef && OrOtherDef);

    MachineInstr *SDWAInst = OrSDWADef->getParent();
    MachineInstr *OtherInst = OrOtherDef->getParent();

    // Check that OtherInst is actually bitwise compatible with SDWAInst, i.e.
    // that their destination patterns don't overlap. A compatible instruction
    // can be either a regular instruction with compatible bitness or an SDWA
    // instruction with a correct dst_sel
    // SDWAInst | OtherInst bitness / OtherInst dst_sel
    // -----------------------------------------------------
    // DWORD    | no                    / no
    // WORD_0   | no                    / BYTE_2/3, WORD_1
    // WORD_1   | 8/16-bit instructions / BYTE_0/1, WORD_0
    // BYTE_0   | no                    / BYTE_1/2/3, WORD_1
    // BYTE_1   | 8-bit                 / BYTE_0/2/3, WORD_1
    // BYTE_2   | 8/16-bit              / BYTE_0/1/3, WORD_0
    // BYTE_3   | 8/16/24-bit           / BYTE_0/1/2, WORD_0
    // E.g. if SDWAInst is v_add_f16_sdwa dst_sel:WORD_1 then v_add_f16 is OK
    // but v_add_f32 is not.

    // TODO: add support for non-SDWA instructions as OtherInst.
    // For now this only works with SDWA instructions. For regular instructions
    // there is no way to determine if the instruction writes only 8/16/24-bit
    // out of the full register size and all registers are at min 32-bit wide.
    if (!TII->isSDWA(*OtherInst))
      break;

    SdwaSel DstSel = static_cast<SdwaSel>(
        TII->getNamedImmOperand(*SDWAInst, AMDGPU::OpName::dst_sel));
    SdwaSel OtherDstSel = static_cast<SdwaSel>(
        TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_sel));

    bool DstSelAgree = false;
    switch (DstSel) {
    case WORD_0: DstSelAgree = ((OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case WORD_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == WORD_0));
      break;
    case BYTE_0: DstSelAgree = ((OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case BYTE_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case BYTE_2: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_0));
      break;
    case BYTE_3: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == WORD_0));
      break;
    default: DstSelAgree = false;
    }

    if (!DstSelAgree)
      break;

    // Also OtherInst dst_unused should be UNUSED_PAD
    DstUnused OtherDstUnused = static_cast<DstUnused>(
        TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_unused));
    if (OtherDstUnused != DstUnused::UNUSED_PAD)
      break;

    // Create DstPreserveOperand
    MachineOperand *OrDst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
    assert(OrDst && OrDst->isReg());

    return make_unique<SDWADstPreserveOperand>(
        OrDst, OrSDWADef, OrOtherDef, DstSel);

  }
  }

  return std::unique_ptr<SDWAOperand>(nullptr);
}

void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
  for (MachineInstr &MI : MBB) {
    if (auto Operand = matchSDWAOperand(MI)) {
      LLVM_DEBUG(dbgs() << "Match: " << MI << "To: " << *Operand << '\n');
      SDWAOperands[&MI] = std::move(Operand);
      ++NumSDWAPatternsFound;
    }
  }
}

// Convert the V_ADDC_U32_e64 into V_ADDC_U32_e32, and V_ADD_I32_e64 into
// V_ADD_I32_e32, so that isConvertibleToSDWA can then transform
// V_ADD_I32_e32 into V_ADD_I32_sdwa.
//
// We are transforming from a VOP3 into a VOP2 form of the instruction.
//   %19:vgpr_32 = V_AND_B32_e32 255,
//       killed %16:vgpr_32, implicit $exec
//   %47:vgpr_32, %49:sreg_64_xexec = V_ADD_I32_e64
//       %26.sub0:vreg_64, %19:vgpr_32, implicit $exec
//   %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64
//       %26.sub1:vreg_64, %54:vgpr_32, killed %49:sreg_64_xexec, implicit $exec
//
// becomes
//   %47:vgpr_32 = V_ADD_I32_sdwa
//       0, %26.sub0:vreg_64, 0, killed %16:vgpr_32, 0, 6, 0, 6, 0,
//       implicit-def $vcc, implicit $exec
//   %48:vgpr_32 = V_ADDC_U32_e32
//       0, %26.sub1:vreg_64, implicit-def $vcc, implicit $vcc, implicit $exec
void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
                                           const GCNSubtarget &ST) const {
  int Opc = MI.getOpcode();
  assert((Opc == AMDGPU::V_ADD_I32_e64 || Opc == AMDGPU::V_SUB_I32_e64) &&
         "Currently only handles V_ADD_I32_e64 or V_SUB_I32_e64");

  // Can the candidate MI be shrunk?
  if (!TII->canShrink(MI, *MRI))
    return;
  Opc = AMDGPU::getVOPe32(Opc);
  // Find the related ADD instruction.
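  // ("Related" here means the V_ADDC_U32_e64 / V_SUBB_U32_e64 successor that
  // consumes the carry-out MI writes to sdst.)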
  const MachineOperand *Sdst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
  if (!Sdst)
    return;
  MachineOperand *NextOp = findSingleRegUse(Sdst, MRI);
  if (!NextOp)
    return;
  MachineInstr &MISucc = *NextOp->getParent();
  // Can the successor be shrunk?
  if (!TII->canShrink(MISucc, *MRI))
    return;
  int SuccOpc = AMDGPU::getVOPe32(MISucc.getOpcode());
  // Make sure the carry in/out are subsequently unused.
  MachineOperand *CarryIn = TII->getNamedOperand(MISucc, AMDGPU::OpName::src2);
  if (!CarryIn)
    return;
  MachineOperand *CarryOut = TII->getNamedOperand(MISucc, AMDGPU::OpName::sdst);
  if (!CarryOut)
    return;
  if (!MRI->hasOneUse(CarryIn->getReg()) || !MRI->use_empty(CarryOut->getReg()))
    return;
  // Make sure VCC or its subregs are dead before MI.
  MachineBasicBlock &MBB = *MI.getParent();
  auto Liveness = MBB.computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 25);
  if (Liveness != MachineBasicBlock::LQR_Dead)
    return;
  // Check if VCC is referenced in the range (MI,MISucc].
  for (auto I = std::next(MI.getIterator()), E = MISucc.getIterator();
       I != E; ++I) {
    if (I->modifiesRegister(AMDGPU::VCC, TRI))
      return;
  }
  // Make the two new e32 instruction variants.
  // Replace MI with V_{SUB|ADD}_I32_e32
  auto NewMI = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opc));
  NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst));
  NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0));
  NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1));
  MI.eraseFromParent();
  // Replace MISucc with V_{SUBB|ADDC}_U32_e32
  auto NewInst = BuildMI(MBB, MISucc, MISucc.getDebugLoc(), TII->get(SuccOpc));
  NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::vdst));
  NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src0));
  NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src1));
  MISucc.eraseFromParent();
}

bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI,
                                         const GCNSubtarget &ST) const {
  // Check if this is already an SDWA instruction
  unsigned Opc = MI.getOpcode();
  if (TII->isSDWA(Opc))
    return true;

  // Check if this instruction has an opcode that supports SDWA
  if (AMDGPU::getSDWAOp(Opc) == -1)
    Opc = AMDGPU::getVOPe32(Opc);

  if (AMDGPU::getSDWAOp(Opc) == -1)
    return false;

  if (!ST.hasSDWAOmod() && TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
    return false;

  if (TII->isVOPC(Opc)) {
    if (!ST.hasSDWASdst()) {
      const MachineOperand *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
      if (SDst && (SDst->getReg() != AMDGPU::VCC &&
                   SDst->getReg() != AMDGPU::VCC_LO))
        return false;
    }

    if (!ST.hasSDWAOutModsVOPC() &&
        (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
         TII->hasModifiersSet(MI, AMDGPU::OpName::omod)))
      return false;

  } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst) ||
             !TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
    return false;
  }

  if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_FMAC_F16_e32 ||
                           Opc == AMDGPU::V_FMAC_F32_e32 ||
                           Opc == AMDGPU::V_MAC_F16_e32 ||
                           Opc == AMDGPU::V_MAC_F32_e32))
    return false;

  // Check if the target supports this SDWA opcode
  if (TII->pseudoToMCOpcode(Opc) == -1)
    return false;

  // FIXME: has SDWA but requires handling of the implicit VCC use
  if (Opc == AMDGPU::V_CNDMASK_B32_e32)
    return false;

  return true;
}

bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
                                   const SDWAOperandsVector &SDWAOperands) {

  LLVM_DEBUG(dbgs() << "Convert instruction:" << MI);

  // Convert to sdwa
  int SDWAOpcode;
  unsigned Opcode = MI.getOpcode();
  if (TII->isSDWA(Opcode)) {
    SDWAOpcode = Opcode;
  } else {
    SDWAOpcode = AMDGPU::getSDWAOp(Opcode);
    if (SDWAOpcode == -1)
      SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(Opcode));
  }
  assert(SDWAOpcode != -1);

  const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);

  // Create the SDWA version of instruction MI and initialize its operands
  MachineInstrBuilder SDWAInst =
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc);

  // Copy dst; if it is present in the original it should also be present in
  // the SDWA form
  MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  if (Dst) {
    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst) != -1);
    SDWAInst.add(*Dst);
  } else if ((Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst))) {
    assert(Dst &&
           AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1);
    SDWAInst.add(*Dst);
  } else {
    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1);
    SDWAInst.addReg(TRI->getVCC(), RegState::Define);
  }

  // Copy src0 and initialize src0_modifiers. All SDWA instructions have src0
  // and src0_modifiers (except for v_nop_sdwa, but it can't get here)
  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  assert(
      Src0 &&
      AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0) != -1 &&
      AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_modifiers) != -1);
  if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers))
    SDWAInst.addImm(Mod->getImm());
  else
    SDWAInst.addImm(0);
  SDWAInst.add(*Src0);

  // Copy src1 if present, initialize src1_modifiers.
  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  if (Src1) {
    assert(
        AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1) != -1 &&
        AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_modifiers) != -1);
    if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers))
      SDWAInst.addImm(Mod->getImm());
    else
      SDWAInst.addImm(0);
    SDWAInst.add(*Src1);
  }

  if (SDWAOpcode == AMDGPU::V_FMAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_FMAC_F32_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) {
    // v_mac_f16/32 has an additional src2 operand tied to vdst
    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    assert(Src2);
    SDWAInst.add(*Src2);
  }

  // Copy clamp if present, initialize otherwise
  assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::clamp) != -1);
  MachineOperand *Clamp = TII->getNamedOperand(MI, AMDGPU::OpName::clamp);
  if (Clamp) {
    SDWAInst.add(*Clamp);
  } else {
    SDWAInst.addImm(0);
  }

  // Copy omod if present, initialize otherwise if needed
  if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::omod) != -1) {
    MachineOperand *OMod = TII->getNamedOperand(MI, AMDGPU::OpName::omod);
    if (OMod) {
      SDWAInst.add(*OMod);
    } else {
      SDWAInst.addImm(0);
    }
  }

  // Copy dst_sel if present, initialize otherwise if needed
  if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_sel) != -1) {
    MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
    if (DstSel) {
      SDWAInst.add(*DstSel);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
    }
  }

  // Copy dst_unused if present, initialize otherwise if needed
  if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_unused) != -1) {
    MachineOperand *DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
    if (DstUnused) {
      SDWAInst.add(*DstUnused);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD);
    }
  }

  // Copy src0_sel if present, initialize otherwise
  assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_sel) != -1);
  MachineOperand *Src0Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
  if (Src0Sel) {
    SDWAInst.add(*Src0Sel);
  } else {
    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
  }

  // Copy src1_sel if present, initialize otherwise if needed
  if (Src1) {
    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_sel) != -1);
    MachineOperand *Src1Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
    if (Src1Sel) {
      SDWAInst.add(*Src1Sel);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
    }
  }

  // Check for a preserved register that needs to be copied.
  auto DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
  if (DstUnused &&
      DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
    // We expect, if we are here, that the instruction was already in its SDWA
    // form, with a tied operand.
    assert(Dst && Dst->isTied());
    assert(Opcode == static_cast<unsigned int>(SDWAOpcode));
    // We also expect a vdst, since sdst can't preserve.
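    // (Preserve semantics apply to the unwritten lanes of a VGPR dst; a
    // scalar sdst has no such lanes to preserve.)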
    auto PreserveDstIdx = AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst);
    assert(PreserveDstIdx != -1);

    auto TiedIdx = MI.findTiedOperandIdx(PreserveDstIdx);
    auto Tied = MI.getOperand(TiedIdx);

    SDWAInst.add(Tied);
    SDWAInst->tieOperands(PreserveDstIdx, SDWAInst->getNumOperands() - 1);
  }

  // Apply all sdwa operand patterns.
  bool Converted = false;
  for (auto &Operand : SDWAOperands) {
    LLVM_DEBUG(dbgs() << *SDWAInst << "\nOperand: " << *Operand);
    // There should be no intersection between SDWA operands and potential MIs
    // e.g.:
    // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0
    // v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0
    // v_add_u32 v3, v4, v2
    //
    // In this example it is possible that we would fold the 2nd instruction
    // into the 3rd (v_add_u32_sdwa) and then try to fold the 1st instruction
    // into the 2nd (which was already destroyed). So if an SDWAOperand is
    // also a potential MI then do not apply it.
    if (PotentialMatches.count(Operand->getParentInst()) == 0)
      Converted |= Operand->convertToSDWA(*SDWAInst, TII);
  }
  if (Converted) {
    ConvertedInstructions.push_back(SDWAInst);
  } else {
    SDWAInst->eraseFromParent();
    return false;
  }

  LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n');
  ++NumSDWAInstructionsPeepholed;

  MI.eraseFromParent();
  return true;
}

// If an instruction was converted to SDWA it should not have immediates or
// SGPR operands (one SGPR operand is allowed on GFX9). Copy its scalar
// operands into VGPRs.
void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI,
                                            const GCNSubtarget &ST) const {
  const MCInstrDesc &Desc = TII->get(MI.getOpcode());
  unsigned ConstantBusCount = 0;
  for (MachineOperand &Op : MI.explicit_uses()) {
    if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg())))
      continue;

    unsigned I = MI.getOperandNo(&Op);
    if (Desc.OpInfo[I].RegClass == -1 ||
        !TRI->hasVGPRs(TRI->getRegClass(Desc.OpInfo[I].RegClass)))
      continue;

    if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() &&
        TRI->isSGPRReg(*MRI, Op.getReg())) {
      ++ConstantBusCount;
      continue;
    }

    unsigned VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
                        TII->get(AMDGPU::V_MOV_B32_e32), VGPR);
    if (Op.isImm())
      Copy.addImm(Op.getImm());
    else if (Op.isReg())
      Copy.addReg(Op.getReg(), Op.isKill() ? RegState::Kill : 0,
                  Op.getSubReg());
    Op.ChangeToRegister(VGPR, false);
  }
}

bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  if (!ST.hasSDWA() || skipFunction(MF.getFunction()))
    return false;

  MRI = &MF.getRegInfo();
  TRI = ST.getRegisterInfo();
  TII = ST.getInstrInfo();

  // Find all SDWA operands in MF.
  bool Ret = false;
  for (MachineBasicBlock &MBB : MF) {
    bool Changed = false;
    do {
      // Preprocess the ADD/SUB pairs so they can be SDWA'ed.
      // Look for a possible ADD or SUB that resulted from a previously lowered
      // V_{ADD|SUB}_U64_PSEUDO. The function pseudoOpConvertToVOP2
      // lowers the pair of instructions into e32 form.
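      // (This first matchSDWAOperands call is used only to locate such pairs;
      // SDWAOperands is cleared and the match list is rebuilt below.)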
      matchSDWAOperands(MBB);
      for (const auto &OperandPair : SDWAOperands) {
        const auto &Operand = OperandPair.second;
        MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
        if (PotentialMI &&
            (PotentialMI->getOpcode() == AMDGPU::V_ADD_I32_e64 ||
             PotentialMI->getOpcode() == AMDGPU::V_SUB_I32_e64))
          pseudoOpConvertToVOP2(*PotentialMI, ST);
      }
      SDWAOperands.clear();

      // Generate potential match list.
      matchSDWAOperands(MBB);

      for (const auto &OperandPair : SDWAOperands) {
        const auto &Operand = OperandPair.second;
        MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
        if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) {
          PotentialMatches[PotentialMI].push_back(Operand.get());
        }
      }

      for (auto &PotentialPair : PotentialMatches) {
        MachineInstr &PotentialMI = *PotentialPair.first;
        convertToSDWA(PotentialMI, PotentialPair.second);
      }

      PotentialMatches.clear();
      SDWAOperands.clear();

      Changed = !ConvertedInstructions.empty();

      if (Changed)
        Ret = true;
      while (!ConvertedInstructions.empty())
        legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
    } while (Changed);
  }

  return Ret;
}