//===- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass tries to apply several peephole SDWA patterns.
///
/// E.g. original:
///   V_LSHRREV_B32_e32 %0, 16, %1
///   V_ADD_CO_U32_e32 %2, %0, %3
///   V_LSHLREV_B32_e32 %4, 16, %2
///
/// Replace:
///   V_ADD_CO_U32_sdwa %4, %1, %3
///       dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"

using namespace llvm;

#define DEBUG_TYPE "si-peephole-sdwa"

STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found.");
STATISTIC(NumSDWAInstructionsPeepholed,
          "Number of instructions converted to SDWA.");

namespace {

class SDWAOperand;
class SDWADstOperand;

class SIPeepholeSDWA : public MachineFunctionPass {
public:
  using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>;

private:
  MachineRegisterInfo *MRI;
  const SIRegisterInfo *TRI;
  const SIInstrInfo *TII;

  MapVector<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
  MapVector<MachineInstr *, SDWAOperandsVector> PotentialMatches;
  SmallVector<MachineInstr *, 8> ConvertedInstructions;

  Optional<int64_t> foldToImm(const MachineOperand &Op) const;

public:
  static char ID;

  SIPeepholeSDWA() : MachineFunctionPass(ID) {
    initializeSIPeepholeSDWAPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
  void matchSDWAOperands(MachineBasicBlock &MBB);
  std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
  bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST) const;
  void pseudoOpConvertToVOP2(MachineInstr &MI,
                             const GCNSubtarget &ST) const;
  bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
  void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const;

  StringRef getPassName() const override { return "SI Peephole SDWA"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

class SDWAOperand {
private:
  MachineOperand *Target; // Operand that would be used in converted instruction
  MachineOperand *Replaced; // Operand that would be replaced by Target

public:
  SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp)
      : Target(TargetOp), Replaced(ReplacedOp) {
    assert(Target->isReg());
    assert(Replaced->isReg());
  }

  virtual ~SDWAOperand() = default;

  virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) = 0;
  virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0;

  MachineOperand *getTargetOperand() const { return Target; }
  MachineOperand *getReplacedOperand() const { return Replaced; }
  MachineInstr *getParentInst() const { return Target->getParent(); }

  MachineRegisterInfo *getMRI() const {
    return &getParentInst()->getParent()->getParent()->getRegInfo();
  }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  virtual void print(raw_ostream& OS) const = 0;
  void dump() const { print(dbgs()); }
#endif
};

using namespace AMDGPU::SDWA;
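// Concrete SDWAOperand kinds, defined below:
//  * SDWASrcOperand         - the matched instruction extracts a byte/word
//                             that an SDWA instruction could read directly
//                             via src_sel.
//  * SDWADstOperand         - the matched instruction deposits a byte/word
//                             that an SDWA instruction could write directly
//                             via dst_sel.
//  * SDWADstPreserveOperand - a v_or_b32 merge expressible with
//                             dst_unused:UNUSED_PRESERVE and a tied operand.
// The pass runs in two phases: matchSDWAOperands() records one of these
// descriptors per matched pattern, then convertToSDWA() rewrites the
// instructions the descriptors point at.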
class SDWASrcOperand : public SDWAOperand {
private:
  SdwaSel SrcSel;
  bool Abs;
  bool Neg;
  bool Sext;

public:
  SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false,
                 bool Sext_ = false)
      : SDWAOperand(TargetOp, ReplacedOp),
        SrcSel(SrcSel_), Abs(Abs_), Neg(Neg_), Sext(Sext_) {}

  MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

  SdwaSel getSrcSel() const { return SrcSel; }
  bool getAbs() const { return Abs; }
  bool getNeg() const { return Neg; }
  bool getSext() const { return Sext; }

  uint64_t getSrcMods(const SIInstrInfo *TII,
                      const MachineOperand *SrcOp) const;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};

class SDWADstOperand : public SDWAOperand {
private:
  SdwaSel DstSel;
  DstUnused DstUn;

public:
  SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD)
      : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}

  MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

  SdwaSel getDstSel() const { return DstSel; }
  DstUnused getDstUnused() const { return DstUn; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};

class SDWADstPreserveOperand : public SDWADstOperand {
private:
  MachineOperand *Preserve;

public:
  SDWADstPreserveOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                         MachineOperand *PreserveOp, SdwaSel DstSel_ = DWORD)
      : SDWADstOperand(TargetOp, ReplacedOp, DstSel_, UNUSED_PRESERVE),
        Preserve(PreserveOp) {}

  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

  MachineOperand *getPreservedOperand() const { return Preserve; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};

} // end anonymous namespace

INITIALIZE_PASS(SIPeepholeSDWA, DEBUG_TYPE, "SI Peephole SDWA", false, false)

char SIPeepholeSDWA::ID = 0;

char &llvm::SIPeepholeSDWAID = SIPeepholeSDWA::ID;

FunctionPass *llvm::createSIPeepholeSDWAPass() {
  return new SIPeepholeSDWA();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
static raw_ostream& operator<<(raw_ostream &OS, SdwaSel Sel) {
  switch(Sel) {
  case BYTE_0: OS << "BYTE_0"; break;
  case BYTE_1: OS << "BYTE_1"; break;
  case BYTE_2: OS << "BYTE_2"; break;
  case BYTE_3: OS << "BYTE_3"; break;
  case WORD_0: OS << "WORD_0"; break;
  case WORD_1: OS << "WORD_1"; break;
  case DWORD: OS << "DWORD"; break;
  }
  return OS;
}

static raw_ostream& operator<<(raw_ostream &OS, const DstUnused &Un) {
  switch(Un) {
  case UNUSED_PAD: OS << "UNUSED_PAD"; break;
  case UNUSED_SEXT: OS << "UNUSED_SEXT"; break;
  case UNUSED_PRESERVE: OS << "UNUSED_PRESERVE"; break;
  }
  return OS;
}

LLVM_DUMP_METHOD
void SDWASrcOperand::print(raw_ostream& OS) const {
  OS << "SDWA src: " << *getTargetOperand()
     << " src_sel:" << getSrcSel()
     << " abs:" << getAbs() << " neg:" << getNeg()
     << " sext:" << getSext() << '\n';
}

LLVM_DUMP_METHOD
void SDWADstOperand::print(raw_ostream& OS) const {
  OS << "SDWA dst: " << *getTargetOperand()
     << " dst_sel:" << getDstSel()
     << " dst_unused:" << getDstUnused() << '\n';
}

LLVM_DUMP_METHOD
void SDWADstPreserveOperand::print(raw_ostream& OS) const {
  OS << "SDWA preserve dst: " << *getTargetOperand()
     << " dst_sel:" << getDstSel()
     << " preserve:" << *getPreservedOperand() << '\n';
}

#endif

static void copyRegOperand(MachineOperand &To, const MachineOperand &From) {
  assert(To.isReg() && From.isReg());
  To.setReg(From.getReg());
  To.setSubReg(From.getSubReg());
  To.setIsUndef(From.isUndef());
  if (To.isUse()) {
    To.setIsKill(From.isKill());
  } else {
    To.setIsDead(From.isDead());
  }
}

static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) {
  return LHS.isReg() &&
         RHS.isReg() &&
         LHS.getReg() == RHS.getReg() &&
         LHS.getSubReg() == RHS.getSubReg();
}

static MachineOperand *findSingleRegUse(const MachineOperand *Reg,
                                        const MachineRegisterInfo *MRI) {
  if (!Reg->isReg() || !Reg->isDef())
    return nullptr;

  MachineOperand *ResMO = nullptr;
  for (MachineOperand &UseMO : MRI->use_nodbg_operands(Reg->getReg())) {
    // If there exists a use of a subreg of Reg then return nullptr
    if (!isSameReg(UseMO, *Reg))
      return nullptr;

    // Check that there is only one instruction that uses Reg
    if (!ResMO) {
      ResMO = &UseMO;
    } else if (ResMO->getParent() != UseMO.getParent()) {
      return nullptr;
    }
  }

  return ResMO;
}

static MachineOperand *findSingleRegDef(const MachineOperand *Reg,
                                        const MachineRegisterInfo *MRI) {
  if (!Reg->isReg())
    return nullptr;

  MachineInstr *DefInstr = MRI->getUniqueVRegDef(Reg->getReg());
  if (!DefInstr)
    return nullptr;

  for (auto &DefMO : DefInstr->defs()) {
    if (DefMO.isReg() && DefMO.getReg() == Reg->getReg())
      return &DefMO;
  }

  // Ignore implicit defs.
  return nullptr;
}
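// Combine the src modifiers already present on SrcOp's slot in the candidate
// instruction with the Abs/Neg/Sext flags recorded for this SDWA operand,
// producing the final src#_modifiers immediate.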
uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII,
                                    const MachineOperand *SrcOp) const {
  uint64_t Mods = 0;
  const auto *MI = SrcOp->getParent();
  if (TII->getNamedOperand(*MI, AMDGPU::OpName::src0) == SrcOp) {
    if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) {
      Mods = Mod->getImm();
    }
  } else if (TII->getNamedOperand(*MI, AMDGPU::OpName::src1) == SrcOp) {
    if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers)) {
      Mods = Mod->getImm();
    }
  }
  if (Abs || Neg) {
    assert(!Sext &&
           "Float and integer src modifiers can't be set simultaneously");
    Mods |= Abs ? SISrcMods::ABS : 0u;
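    // NEG is folded in with XOR rather than OR: if the operand already
    // carries a NEG modifier, applying another negation must toggle it off,
    // since the two negations cancel. ABS is idempotent, so plain OR
    // suffices above.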
    Mods ^= Neg ? SISrcMods::NEG : 0u;
  } else if (Sext) {
    Mods |= SISrcMods::SEXT;
  }

  return Mods;
}

MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII) {
  // For SDWA src operand potential instruction is one that uses register
  // defined by parent instruction
  MachineOperand *PotentialMO = findSingleRegUse(getReplacedOperand(), getMRI());
  if (!PotentialMO)
    return nullptr;

  return PotentialMO->getParent();
}

bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  // Find operand in instruction that matches source operand and replace it
  // with target operand. Set corresponding src_sel
  bool IsPreserveSrc = false;
  MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
  MachineOperand *SrcMods =
      TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
  assert(Src && (Src->isReg() || Src->isImm()));
  if (!isSameReg(*Src, *getReplacedOperand())) {
    // If this is not src0 then it could be src1
    Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
    SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);

    if (!Src ||
        !isSameReg(*Src, *getReplacedOperand())) {
      // It's possible this Src is a tied operand for
      // UNUSED_PRESERVE, in which case we can either
      // abandon the peephole attempt, or if legal we can
      // copy the target operand into the tied slot
      // if the preserve operation will effectively cause the same
      // result by overwriting the rest of the dst.
      MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
      MachineOperand *DstUnused =
          TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);

      if (Dst &&
          DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
        // This will work if the tied src is accessing WORD_0, and the dst is
        // writing WORD_1. Modifiers don't matter because all the bits that
        // would be impacted are being overwritten by the dst.
        // Any other case will not work.
        SdwaSel DstSel = static_cast<SdwaSel>(
            TII->getNamedImmOperand(MI, AMDGPU::OpName::dst_sel));
        if (DstSel == AMDGPU::SDWA::SdwaSel::WORD_1 &&
            getSrcSel() == AMDGPU::SDWA::SdwaSel::WORD_0) {
          IsPreserveSrc = true;
          auto DstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                                   AMDGPU::OpName::vdst);
          auto TiedIdx = MI.findTiedOperandIdx(DstIdx);
          Src = &MI.getOperand(TiedIdx);
          SrcSel = nullptr;
          SrcMods = nullptr;
        } else {
          // Not legal to convert this src
          return false;
        }
      }
    }
    assert(Src && Src->isReg());

    if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
         MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
         MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
         MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
        !isSameReg(*Src, *getReplacedOperand())) {
      // In case of v_mac_f16/32_sdwa this pass can try to apply src operand
      // to src2. This is not allowed.
      return false;
    }

    assert(isSameReg(*Src, *getReplacedOperand()) &&
           (IsPreserveSrc || (SrcSel && SrcMods)));
  }
  copyRegOperand(*Src, *getTargetOperand());
  if (!IsPreserveSrc) {
    SrcSel->setImm(getSrcSel());
    SrcMods->setImm(getSrcMods(TII, Src));
  }
  getTargetOperand()->setIsKill(false);
  return true;
}
MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII) {
  // For SDWA dst operand potential instruction is one that defines the
  // register that this operand uses
  MachineRegisterInfo *MRI = getMRI();
  MachineInstr *ParentMI = getParentInst();

  MachineOperand *PotentialMO = findSingleRegDef(getReplacedOperand(), MRI);
  if (!PotentialMO)
    return nullptr;

  // Check that ParentMI is the only instruction that uses the replaced
  // register
  for (MachineInstr &UseInst : MRI->use_nodbg_instructions(PotentialMO->getReg())) {
    if (&UseInst != ParentMI)
      return nullptr;
  }

  return PotentialMO->getParent();
}

bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  // Replace vdst operand in MI with target operand. Set dst_sel and dst_unused

  if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
      getDstSel() != AMDGPU::SDWA::DWORD) {
    // v_mac_f16/32_sdwa allow dst_sel to be equal only to DWORD
    return false;
  }

  MachineOperand *Operand = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  assert(Operand &&
         Operand->isReg() &&
         isSameReg(*Operand, *getReplacedOperand()));
  copyRegOperand(*Operand, *getTargetOperand());
  MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
  assert(DstSel);
  DstSel->setImm(getDstSel());
  MachineOperand *DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
  assert(DstUnused);
  DstUnused->setImm(getDstUnused());

  // Remove original instruction because it would conflict with our new
  // instruction by register definition
  getParentInst()->eraseFromParent();
  return true;
}
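// Illustrative MIR sketch of the conversion below (registers taken from the
// UNUSED_PRESERVE pattern comment in matchSDWAOperand). The merge
//   v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD ...
//   v_add_f16_e32  v3, v1, v2
//   v_or_b32_e32   v4, v0, v3
// becomes a single SDWA op that writes WORD_1 of v4 and preserves the
// remaining bits from v3 via a tied implicit use:
//   v_add_f16_e32  v3, v1, v2
//   v_add_f16_sdwa v4, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE ...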
bool SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI,
                                           const SIInstrInfo *TII) {
  // MI should be moved right before v_or_b32.
  // For this we should clear all kill flags on uses of MI src-operands or else
  // we can encounter a problem with a use of a killed operand.
  for (MachineOperand &MO : MI.uses()) {
    if (!MO.isReg())
      continue;
    getMRI()->clearKillFlags(MO.getReg());
  }

  // Move MI before v_or_b32
  auto MBB = MI.getParent();
  MBB->remove(&MI);
  MBB->insert(getParentInst(), &MI);

  // Add implicit use of preserved register
  MachineInstrBuilder MIB(*MBB->getParent(), MI);
  MIB.addReg(getPreservedOperand()->getReg(),
             RegState::ImplicitKill,
             getPreservedOperand()->getSubReg());

  // Tie dst to implicit use
  MI.tieOperands(AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst),
                 MI.getNumOperands() - 1);

  // Convert MI as any other SDWADstOperand and remove v_or_b32
  return SDWADstOperand::convertToSDWA(MI, TII);
}
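// Try to reduce Op to a compile-time immediate: either Op is already an
// immediate, or it is a register whose def is a foldable copy of one.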
Optional<int64_t> SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const {
  if (Op.isImm()) {
    return Op.getImm();
  }

  // If this is not an immediate then it can be a copy of an immediate value,
  // e.g.:
  // %1 = S_MOV_B32 255;
  if (Op.isReg()) {
    for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) {
      if (!isSameReg(Op, Def))
        continue;

      const MachineInstr *DefInst = Def.getParent();
      if (!TII->isFoldableCopy(*DefInst))
        return None;

      const MachineOperand &Copied = DefInst->getOperand(1);
      if (!Copied.isImm())
        return None;

      return Copied.getImm();
    }
  }

  return None;
}
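// Try to describe MI as an SDWA operand pattern: a shift, bitfield extract,
// or mask that merely extracts a byte/word (an SDWASrcOperand), a shift that
// deposits into the upper part of a register (an SDWADstOperand), or a
// v_or_b32 merge of two SDWA results (an SDWADstPreserveOperand).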
std::unique_ptr<SDWAOperand>
SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  switch (Opcode) {
  case AMDGPU::V_LSHRREV_B32_e32:
  case AMDGPU::V_ASHRREV_I32_e32:
  case AMDGPU::V_LSHLREV_B32_e32:
  case AMDGPU::V_LSHRREV_B32_e64:
  case AMDGPU::V_ASHRREV_I32_e64:
  case AMDGPU::V_LSHLREV_B32_e64: {
    // from: v_lshrrev_b32_e32 v1, 16/24, v0
    // to SDWA src:v0 src_sel:WORD_1/BYTE_3

    // from: v_ashrrev_i32_e32 v1, 16/24, v0
    // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1

    // from: v_lshlrev_b32_e32 v1, 16/24, v0
    // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD
    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    auto Imm = foldToImm(*Src0);
    if (!Imm)
      break;

    if (*Imm != 16 && *Imm != 24)
      break;

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
    if (Src1->getReg().isPhysical() || Dst->getReg().isPhysical())
      break;

    if (Opcode == AMDGPU::V_LSHLREV_B32_e32 ||
        Opcode == AMDGPU::V_LSHLREV_B32_e64) {
      return std::make_unique<SDWADstOperand>(
          Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
    } else {
      return std::make_unique<SDWASrcOperand>(
          Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false,
          Opcode != AMDGPU::V_LSHRREV_B32_e32 &&
          Opcode != AMDGPU::V_LSHRREV_B32_e64);
    }
    break;
  }

  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_LSHLREV_B16_e64: {
    // from: v_lshrrev_b16_e32 v1, 8, v0
    // to SDWA src:v0 src_sel:BYTE_1

    // from: v_ashrrev_i16_e32 v1, 8, v0
    // to SDWA src:v0 src_sel:BYTE_1 sext:1

    // from: v_lshlrev_b16_e32 v1, 8, v0
    // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD
    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    auto Imm = foldToImm(*Src0);
    if (!Imm || *Imm != 8)
      break;

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (Src1->getReg().isPhysical() || Dst->getReg().isPhysical())
      break;

    if (Opcode == AMDGPU::V_LSHLREV_B16_e32 ||
        Opcode == AMDGPU::V_LSHLREV_B16_e64) {
      return std::make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
    } else {
      return std::make_unique<SDWASrcOperand>(
          Src1, Dst, BYTE_1, false, false,
          Opcode != AMDGPU::V_LSHRREV_B16_e32 &&
          Opcode != AMDGPU::V_LSHRREV_B16_e64);
    }
    break;
  }

  case AMDGPU::V_BFE_I32_e64:
  case AMDGPU::V_BFE_U32_e64: {
    // e.g.:
    // from: v_bfe_u32 v1, v0, 8, 8
    // to SDWA src:v0 src_sel:BYTE_1

    // offset | width | src_sel
    // ------------------------
    // 0      | 8     | BYTE_0
    // 0      | 16    | WORD_0
    // 0      | 32    | DWORD ?
    // 8      | 8     | BYTE_1
    // 16     | 8     | BYTE_2
    // 16     | 16    | WORD_1
    // 24     | 8     | BYTE_3

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    auto Offset = foldToImm(*Src1);
    if (!Offset)
      break;

    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    auto Width = foldToImm(*Src2);
    if (!Width)
      break;

    SdwaSel SrcSel = DWORD;

    if (*Offset == 0 && *Width == 8)
      SrcSel = BYTE_0;
    else if (*Offset == 0 && *Width == 16)
      SrcSel = WORD_0;
    else if (*Offset == 0 && *Width == 32)
      SrcSel = DWORD;
    else if (*Offset == 8 && *Width == 8)
      SrcSel = BYTE_1;
    else if (*Offset == 16 && *Width == 8)
      SrcSel = BYTE_2;
    else if (*Offset == 16 && *Width == 16)
      SrcSel = WORD_1;
    else if (*Offset == 24 && *Width == 8)
      SrcSel = BYTE_3;
    else
      break;

    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (Src0->getReg().isPhysical() || Dst->getReg().isPhysical())
      break;

    return std::make_unique<SDWASrcOperand>(
        Src0, Dst, SrcSel, false, false, Opcode != AMDGPU::V_BFE_U32_e64);
  }

  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::V_AND_B32_e64: {
    // e.g.:
    // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0
    // to SDWA src:v0 src_sel:WORD_0/BYTE_0

    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    auto ValSrc = Src1;
    auto Imm = foldToImm(*Src0);

    if (!Imm) {
      Imm = foldToImm(*Src1);
      ValSrc = Src0;
    }

    if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff))
      break;

    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (ValSrc->getReg().isPhysical() || Dst->getReg().isPhysical())
      break;

    return std::make_unique<SDWASrcOperand>(
        ValSrc, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
  }
  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::V_OR_B32_e64: {
    // Patterns for dst_unused:UNUSED_PRESERVE.
    // e.g., from:
    // v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD
    //                           src0_sel:WORD_1 src1_sel:WORD_1
    // v_add_f16_e32 v3, v1, v2
    // v_or_b32_e32 v4, v0, v3
    // to SDWA preserve dst:v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE preserve:v3

    // Check if one of the operands of v_or_b32 is an SDWA instruction
    using CheckRetType = Optional<std::pair<MachineOperand *, MachineOperand *>>;
    auto CheckOROperandsForSDWA =
      [&](const MachineOperand *Op1, const MachineOperand *Op2) -> CheckRetType {
        if (!Op1 || !Op1->isReg() || !Op2 || !Op2->isReg())
          return CheckRetType(None);

        MachineOperand *Op1Def = findSingleRegDef(Op1, MRI);
        if (!Op1Def)
          return CheckRetType(None);

        MachineInstr *Op1Inst = Op1Def->getParent();
        if (!TII->isSDWA(*Op1Inst))
          return CheckRetType(None);

        MachineOperand *Op2Def = findSingleRegDef(Op2, MRI);
        if (!Op2Def)
          return CheckRetType(None);

        return CheckRetType(std::make_pair(Op1Def, Op2Def));
      };

    MachineOperand *OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    assert(OrSDWA && OrOther);
    auto Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
    if (!Res) {
      OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
      OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
      assert(OrSDWA && OrOther);
      Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
      if (!Res)
        break;
    }

    MachineOperand *OrSDWADef = Res->first;
    MachineOperand *OrOtherDef = Res->second;
    assert(OrSDWADef && OrOtherDef);

    MachineInstr *SDWAInst = OrSDWADef->getParent();
    MachineInstr *OtherInst = OrOtherDef->getParent();

    // Check that OtherInst is actually bitwise compatible with SDWAInst, i.e.
    // that their destination bit patterns don't overlap. A compatible
    // instruction can be either a regular instruction with compatible bitness
    // or an SDWA instruction with a correct dst_sel.
    // SDWAInst | OtherInst bitness / OtherInst dst_sel
    // -----------------------------------------------------
    // DWORD    | no                     / no
    // WORD_0   | no                     / BYTE_2/3, WORD_1
    // WORD_1   | 8/16-bit instructions  / BYTE_0/1, WORD_0
    // BYTE_0   | no                     / BYTE_1/2/3, WORD_1
    // BYTE_1   | 8-bit                  / BYTE_0/2/3, WORD_1
    // BYTE_2   | 8/16-bit               / BYTE_0/1/3, WORD_0
    // BYTE_3   | 8/16/24-bit            / BYTE_0/1/2, WORD_0
    // E.g. if SDWAInst is v_add_f16_sdwa dst_sel:WORD_1 then v_add_f16 is OK
    // but v_add_f32 is not.

    // TODO: add support for non-SDWA instructions as OtherInst.
    // For now this only works with SDWA instructions. For regular instructions
    // there is no way to determine if the instruction writes only 8/16/24 bits
    // out of the full register size, and all registers are at least 32 bits
    // wide.
    if (!TII->isSDWA(*OtherInst))
      break;

    SdwaSel DstSel = static_cast<SdwaSel>(
        TII->getNamedImmOperand(*SDWAInst, AMDGPU::OpName::dst_sel));
    SdwaSel OtherDstSel = static_cast<SdwaSel>(
        TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_sel));

    bool DstSelAgree = false;
    switch (DstSel) {
    case WORD_0: DstSelAgree = ((OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case WORD_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == WORD_0));
      break;
    case BYTE_0: DstSelAgree = ((OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case BYTE_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case BYTE_2: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_0));
      break;
    case BYTE_3: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == WORD_0));
      break;
    default: DstSelAgree = false;
    }

    if (!DstSelAgree)
      break;

    // Also OtherInst dst_unused should be UNUSED_PAD
    DstUnused OtherDstUnused = static_cast<DstUnused>(
        TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_unused));
    if (OtherDstUnused != DstUnused::UNUSED_PAD)
      break;

    // Create DstPreserveOperand
    MachineOperand *OrDst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
    assert(OrDst && OrDst->isReg());

    return std::make_unique<SDWADstPreserveOperand>(
        OrDst, OrSDWADef, OrOtherDef, DstSel);
  }
  }

  return std::unique_ptr<SDWAOperand>(nullptr);
}

#if !defined(NDEBUG)
static raw_ostream& operator<<(raw_ostream &OS, const SDWAOperand &Operand) {
  Operand.print(OS);
  return OS;
}
#endif
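// Scan the basic block and record an SDWAOperand descriptor, keyed by the
// matched instruction, for every instruction that fits one of the patterns
// above.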
void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
  for (MachineInstr &MI : MBB) {
    if (auto Operand = matchSDWAOperand(MI)) {
      LLVM_DEBUG(dbgs() << "Match: " << MI << "To: " << *Operand << '\n');
      SDWAOperands[&MI] = std::move(Operand);
      ++NumSDWAPatternsFound;
    }
  }
}

// Convert the V_ADDC_U32_e64 into V_ADDC_U32_e32, and
// V_ADD_CO_U32_e64 into V_ADD_CO_U32_e32. This allows isConvertibleToSDWA
// to perform its transformation on V_ADD_CO_U32_e32 into V_ADD_CO_U32_sdwa.
//
// We are transforming from a VOP3 into a VOP2 form of the instruction.
//   %19:vgpr_32 = V_AND_B32_e32 255,
//       killed %16:vgpr_32, implicit $exec
//   %47:vgpr_32, %49:sreg_64_xexec = V_ADD_CO_U32_e64
//       %26.sub0:vreg_64, %19:vgpr_32, implicit $exec
//   %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64
//       %26.sub1:vreg_64, %54:vgpr_32, killed %49:sreg_64_xexec, implicit $exec
//
// becomes
//   %47:vgpr_32 = V_ADD_CO_U32_sdwa
//       0, %26.sub0:vreg_64, 0, killed %16:vgpr_32, 0, 6, 0, 6, 0,
//       implicit-def $vcc, implicit $exec
//   %48:vgpr_32 = V_ADDC_U32_e32
//       0, %26.sub1:vreg_64, implicit-def $vcc, implicit $vcc, implicit $exec
void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
                                           const GCNSubtarget &ST) const {
  int Opc = MI.getOpcode();
  assert((Opc == AMDGPU::V_ADD_CO_U32_e64 || Opc == AMDGPU::V_SUB_CO_U32_e64) &&
         "Currently only handles V_ADD_CO_U32_e64 or V_SUB_CO_U32_e64");

  // Can the candidate MI be shrunk?
  if (!TII->canShrink(MI, *MRI))
    return;
  Opc = AMDGPU::getVOPe32(Opc);
  // Find the related ADD instruction.
  const MachineOperand *Sdst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
  if (!Sdst)
    return;
  MachineOperand *NextOp = findSingleRegUse(Sdst, MRI);
  if (!NextOp)
    return;
  MachineInstr &MISucc = *NextOp->getParent();
  // Can the successor be shrunk?
  if (!TII->canShrink(MISucc, *MRI))
    return;
  int SuccOpc = AMDGPU::getVOPe32(MISucc.getOpcode());
  // Make sure the carry in/out are subsequently unused.
  MachineOperand *CarryIn = TII->getNamedOperand(MISucc, AMDGPU::OpName::src2);
  if (!CarryIn)
    return;
  MachineOperand *CarryOut = TII->getNamedOperand(MISucc, AMDGPU::OpName::sdst);
  if (!CarryOut)
    return;
  if (!MRI->hasOneUse(CarryIn->getReg()) || !MRI->use_empty(CarryOut->getReg()))
    return;
  // Make sure VCC or its subregs are dead before MI.
  MachineBasicBlock &MBB = *MI.getParent();
  auto Liveness = MBB.computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 25);
  if (Liveness != MachineBasicBlock::LQR_Dead)
    return;
  // Check if VCC is referenced in range of (MI,MISucc].
  for (auto I = std::next(MI.getIterator()), E = MISucc.getIterator();
       I != E; ++I) {
    if (I->modifiesRegister(AMDGPU::VCC, TRI))
      return;
  }
  // Make the two new e32 instruction variants.
  // Replace MI with V_{SUB|ADD}_I32_e32
  BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opc))
      .add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst))
      .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0))
      .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1))
      .setMIFlags(MI.getFlags());

  MI.eraseFromParent();

  // Replace MISucc with V_{SUBB|ADDC}_U32_e32
  BuildMI(MBB, MISucc, MISucc.getDebugLoc(), TII->get(SuccOpc))
      .add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::vdst))
      .add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src0))
      .add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src1))
      .setMIFlags(MISucc.getFlags());

  MISucc.eraseFromParent();
}
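// Check whether MI has (or can be shrunk to an e32 form that has) an SDWA
// counterpart on this subtarget, and whether its operands and modifiers are
// representable in that encoding.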
bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI,
                                         const GCNSubtarget &ST) const {
  // Check if this is already an SDWA instruction
  unsigned Opc = MI.getOpcode();
  if (TII->isSDWA(Opc))
    return true;

  // Check if this instruction has an opcode that supports SDWA
  if (AMDGPU::getSDWAOp(Opc) == -1)
    Opc = AMDGPU::getVOPe32(Opc);

  if (AMDGPU::getSDWAOp(Opc) == -1)
    return false;

  if (!ST.hasSDWAOmod() && TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
    return false;

  if (TII->isVOPC(Opc)) {
    if (!ST.hasSDWASdst()) {
      const MachineOperand *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
      if (SDst && (SDst->getReg() != AMDGPU::VCC &&
                   SDst->getReg() != AMDGPU::VCC_LO))
        return false;
    }

    if (!ST.hasSDWAOutModsVOPC() &&
        (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
         TII->hasModifiersSet(MI, AMDGPU::OpName::omod)))
      return false;

  } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst) ||
             !TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
    return false;
  }

  if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_FMAC_F16_e32 ||
                           Opc == AMDGPU::V_FMAC_F32_e32 ||
                           Opc == AMDGPU::V_MAC_F16_e32 ||
                           Opc == AMDGPU::V_MAC_F32_e32))
    return false;

  // Check if target supports this SDWA opcode
  if (TII->pseudoToMCOpcode(Opc) == -1)
    return false;

  // FIXME: has SDWA but requires handling of implicit VCC use
  if (Opc == AMDGPU::V_CNDMASK_B32_e32)
    return false;

  if (MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0)) {
    if (!Src0->isReg() && !Src0->isImm())
      return false;
  }

  if (MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1)) {
    if (!Src1->isReg() && !Src1->isImm())
      return false;
  }

  return true;
}
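// Rewrite MI into its SDWA counterpart: build the SDWA opcode, transfer the
// existing operands, initialize the SDWA-specific ones to their defaults, and
// then let each matched SDWAOperand patch in its sel/modifier values.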
bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
                                   const SDWAOperandsVector &SDWAOperands) {

  LLVM_DEBUG(dbgs() << "Convert instruction:" << MI);

  // Convert to sdwa
  int SDWAOpcode;
  unsigned Opcode = MI.getOpcode();
  if (TII->isSDWA(Opcode)) {
    SDWAOpcode = Opcode;
  } else {
    SDWAOpcode = AMDGPU::getSDWAOp(Opcode);
    if (SDWAOpcode == -1)
      SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(Opcode));
  }
  assert(SDWAOpcode != -1);

  const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);

  // Create SDWA version of instruction MI and initialize its operands
  MachineInstrBuilder SDWAInst =
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc)
          .setMIFlags(MI.getFlags());

  // Copy dst; if it is present in the original then it should also be present
  // in the SDWA form
  MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  if (Dst) {
    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst) != -1);
    SDWAInst.add(*Dst);
  } else if ((Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst))) {
    assert(Dst &&
           AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1);
    SDWAInst.add(*Dst);
  } else {
    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1);
    SDWAInst.addReg(TRI->getVCC(), RegState::Define);
  }

  // Copy src0, initialize src0_modifiers. All sdwa instructions have src0 and
  // src0_modifiers (except for v_nop_sdwa, but it can't get here)
  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  assert(
    Src0 &&
    AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0) != -1 &&
    AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_modifiers) != -1);
  if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers))
    SDWAInst.addImm(Mod->getImm());
  else
    SDWAInst.addImm(0);
  SDWAInst.add(*Src0);

  // Copy src1 if present, initialize src1_modifiers.
  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  if (Src1) {
    assert(
      AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1) != -1 &&
      AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_modifiers) != -1);
    if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers))
      SDWAInst.addImm(Mod->getImm());
    else
      SDWAInst.addImm(0);
    SDWAInst.add(*Src1);
  }

  if (SDWAOpcode == AMDGPU::V_FMAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_FMAC_F32_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) {
    // v_mac_f16/32 has additional src2 operand tied to vdst
    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    assert(Src2);
    SDWAInst.add(*Src2);
  }

  // Copy clamp if present, initialize otherwise
  assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::clamp) != -1);
  MachineOperand *Clamp = TII->getNamedOperand(MI, AMDGPU::OpName::clamp);
  if (Clamp) {
    SDWAInst.add(*Clamp);
  } else {
    SDWAInst.addImm(0);
  }

  // Copy omod if present, initialize otherwise if needed
  if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::omod) != -1) {
    MachineOperand *OMod = TII->getNamedOperand(MI, AMDGPU::OpName::omod);
    if (OMod) {
      SDWAInst.add(*OMod);
    } else {
      SDWAInst.addImm(0);
    }
  }

  // Copy dst_sel if present, initialize otherwise if needed
  if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_sel) != -1) {
    MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
    if (DstSel) {
      SDWAInst.add(*DstSel);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
    }
  }

  // Copy dst_unused if present, initialize otherwise if needed
  if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_unused) != -1) {
    MachineOperand *DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
    if (DstUnused) {
      SDWAInst.add(*DstUnused);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD);
    }
  }

  // Copy src0_sel if present, initialize otherwise
  assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_sel) != -1);
  MachineOperand *Src0Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
  if (Src0Sel) {
    SDWAInst.add(*Src0Sel);
  } else {
    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
  }

  // Copy src1_sel if present, initialize otherwise if needed
  if (Src1) {
    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_sel) != -1);
    MachineOperand *Src1Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
    if (Src1Sel) {
      SDWAInst.add(*Src1Sel);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
    }
  }

  // Check for a preserved register that needs to be copied.
  auto DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
  if (DstUnused &&
      DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
    // We expect, if we are here, that the instruction was already in its SDWA
    // form, with a tied operand.
    assert(Dst && Dst->isTied());
    assert(Opcode == static_cast<unsigned int>(SDWAOpcode));
    // We also expect a vdst, since sdst can't preserve.
    auto PreserveDstIdx = AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst);
    assert(PreserveDstIdx != -1);

    auto TiedIdx = MI.findTiedOperandIdx(PreserveDstIdx);
    auto Tied = MI.getOperand(TiedIdx);

    SDWAInst.add(Tied);
    SDWAInst->tieOperands(PreserveDstIdx, SDWAInst->getNumOperands() - 1);
  }

  // Apply all sdwa operand patterns.
  bool Converted = false;
  for (auto &Operand : SDWAOperands) {
    LLVM_DEBUG(dbgs() << *SDWAInst << "\nOperand: " << *Operand);
    // There should be no intersection between SDWA operands and potential MIs
    // e.g.:
    // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0
    // v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0
    // v_add_u32 v3, v4, v2
    //
    // In that example it is possible that we would fold the 2nd instruction
    // into the 3rd (v_add_u32_sdwa) and then try to fold the 1st instruction
    // into the 2nd (which was already destroyed). So if an SDWAOperand is
    // also a potential MI then do not apply it.
    if (PotentialMatches.count(Operand->getParentInst()) == 0)
      Converted |= Operand->convertToSDWA(*SDWAInst, TII);
  }
  if (Converted) {
    ConvertedInstructions.push_back(SDWAInst);
  } else {
    SDWAInst->eraseFromParent();
    return false;
  }

  LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n');
  ++NumSDWAInstructionsPeepholed;

  MI.eraseFromParent();
  return true;
}

// If an instruction was converted to SDWA it should not have immediates or
// SGPR operands (one SGPR is allowed on GFX9). Copy its scalar operands into
// VGPRs.
void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI,
                                            const GCNSubtarget &ST) const {
  const MCInstrDesc &Desc = TII->get(MI.getOpcode());
  unsigned ConstantBusCount = 0;
  for (MachineOperand &Op : MI.explicit_uses()) {
    if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg())))
      continue;

    unsigned I = MI.getOperandNo(&Op);
    if (Desc.OpInfo[I].RegClass == -1 ||
        !TRI->hasVGPRs(TRI->getRegClass(Desc.OpInfo[I].RegClass)))
      continue;

    if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() &&
        TRI->isSGPRReg(*MRI, Op.getReg())) {
      ++ConstantBusCount;
      continue;
    }

    Register VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
                        TII->get(AMDGPU::V_MOV_B32_e32), VGPR);
    if (Op.isImm())
      Copy.addImm(Op.getImm());
    else if (Op.isReg())
      Copy.addReg(Op.getReg(), Op.isKill() ? RegState::Kill : 0,
                  Op.getSubReg());
    Op.ChangeToRegister(VGPR, false);
  }
}
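// Per-block driver. Converting one instruction to SDWA can expose new
// matchable patterns, so matching and conversion repeat until a pass over
// the block converts nothing.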
bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  if (!ST.hasSDWA() || skipFunction(MF.getFunction()))
    return false;

  MRI = &MF.getRegInfo();
  TRI = ST.getRegisterInfo();
  TII = ST.getInstrInfo();

  // Find all SDWA operands in MF.
  bool Ret = false;
  for (MachineBasicBlock &MBB : MF) {
    bool Changed = false;
    do {
      // Preprocess the ADD/SUB pairs so they can be SDWA'ed.
      // Look for a possible ADD or SUB that resulted from a previously lowered
      // V_{ADD|SUB}_U64_PSEUDO. The function pseudoOpConvertToVOP2
      // lowers the pair of instructions into e32 form.
      matchSDWAOperands(MBB);
      for (const auto &OperandPair : SDWAOperands) {
        const auto &Operand = OperandPair.second;
        MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
        if (PotentialMI &&
            (PotentialMI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 ||
             PotentialMI->getOpcode() == AMDGPU::V_SUB_CO_U32_e64))
          pseudoOpConvertToVOP2(*PotentialMI, ST);
      }
      SDWAOperands.clear();

      // Generate potential match list.
      matchSDWAOperands(MBB);

      for (const auto &OperandPair : SDWAOperands) {
        const auto &Operand = OperandPair.second;
        MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
        if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) {
          PotentialMatches[PotentialMI].push_back(Operand.get());
        }
      }

      for (auto &PotentialPair : PotentialMatches) {
        MachineInstr &PotentialMI = *PotentialPair.first;
        convertToSDWA(PotentialMI, PotentialPair.second);
      }

      PotentialMatches.clear();
      SDWAOperands.clear();

      Changed = !ConvertedInstructions.empty();

      if (Changed)
        Ret = true;
      while (!ConvertedInstructions.empty())
        legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
    } while (Changed);
  }

  return Ret;
}