//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements hazard recognizers for scheduling on GCN processors.
//
//===----------------------------------------------------------------------===//

#include "GCNHazardRecognizer.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/TargetParser/TargetParser.h"

using namespace llvm;

namespace {

struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
  MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}

  bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
    if (Arg.getAsInteger(0, Value))
      return O.error("'" + Arg + "' value invalid for uint argument!");

    if (Value > 100)
      return O.error("'" + Arg + "' value must be in the range [0, 100]!");

    return false;
  }
};

} // end anonymous namespace

static cl::opt<unsigned, false, MFMAPaddingRatioParser>
    MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden,
                     cl::desc("Fill a percentage of the latency between "
                              "neighboring MFMA with s_nops."));

// This is intended for debugging purposes only.
static cl::opt<unsigned>
    NopPadding("amdgpu-snop-padding", cl::init(0), cl::Hidden,
               cl::desc("Insert an s_nop x before every instruction"));

//===----------------------------------------------------------------------===//
// Hazard Recognizer Implementation
//===----------------------------------------------------------------------===//

static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
                                                 const GCNSubtarget &ST);

GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF)
    : IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), MF(MF),
      ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
      TRI(TII.getRegisterInfo()), TSchedModel(TII.getSchedModel()),
      ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) {
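  // MFMA hazards require a much deeper lookahead than the other hazards
  // handled here, so use a larger window whenever AGPRs (and therefore
  // MFMAs) may be in use. The constants are presumably sized to the
  // worst-case wait-state counts of the corresponding check* routines.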
  MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
  RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
}

void GCNHazardRecognizer::Reset() {
  EmittedInstrs.clear();
}

void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
  EmitInstruction(SU->getInstr());
}

void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
  CurrCycleInstr = MI;
}

static bool isDivFMas(unsigned Opcode) {
  return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 ||
         Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
}

static bool isSGetReg(unsigned Opcode) {
  return Opcode == AMDGPU::S_GETREG_B32;
}

static bool isSSetReg(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_SETREG_B32:
  case AMDGPU::S_SETREG_B32_mode:
  case AMDGPU::S_SETREG_IMM32_B32:
  case AMDGPU::S_SETREG_IMM32_B32_mode:
    return true;
  }
  return false;
}

static bool isRWLane(unsigned Opcode) {
  return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
}

static bool isRFE(unsigned Opcode) {
  return Opcode == AMDGPU::S_RFE_B64;
}

static bool isSMovRel(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_MOVRELS_B32:
  case AMDGPU::S_MOVRELS_B64:
  case AMDGPU::S_MOVRELD_B32:
  case AMDGPU::S_MOVRELD_B64:
    return true;
  default:
    return false;
  }
}

static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
                                    const MachineInstr &MI) {
  if (TII.isAlwaysGDS(MI.getOpcode()))
    return true;

  switch (MI.getOpcode()) {
  case AMDGPU::S_SENDMSG:
  case AMDGPU::S_SENDMSGHALT:
  case AMDGPU::S_TTRACEDATA:
    return true;
  // These DS opcodes don't support GDS.
  case AMDGPU::DS_NOP:
  case AMDGPU::DS_PERMUTE_B32:
  case AMDGPU::DS_BPERMUTE_B32:
    return false;
  default:
    if (TII.isDS(MI.getOpcode())) {
      int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                           AMDGPU::OpName::gds);
      if (MI.getOperand(GDS).getImm())
        return true;
    }
    return false;
  }
}

static bool isPermlane(const MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE64_B32 ||
         Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e32 ||
         Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e32 ||
         Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e64;
}

static bool isLdsDma(const MachineInstr &MI) {
  return SIInstrInfo::isVALU(MI) &&
         (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI));
}

static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
  const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
                                                     AMDGPU::OpName::simm16);
  return std::get<0>(AMDGPU::Hwreg::HwregEncoding::decode(RegOp->getImm()));
}

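// Hazard query used both from the scheduler and from the standalone hazard
// recognizer pass. In hazard recognizer mode, returning NoopHazard requests
// that noops actually be inserted; otherwise the scheduler simply stalls the
// instruction.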
ScheduleHazardRecognizer::HazardType
GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
  MachineInstr *MI = SU->getInstr();
  // If we are not in "HazardRecognizerMode" and therefore not being run from
  // the scheduler, track possible stalls from hazards but don't insert noops.
  auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;

  if (MI->isBundle())
    return NoHazard;

  if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
    return HazardType;

  if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
    return HazardType;

  if (checkFPAtomicToDenormModeHazard(MI) > 0)
    return HazardType;

  if (ST.hasNoDataDepHazard())
    return NoHazard;

  if (SIInstrInfo::isVMEM(*MI) && checkVMEMHazards(MI) > 0)
    return HazardType;

  if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
    return HazardType;

  if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
    return HazardType;

  if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
    return HazardType;

  if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
    return HazardType;

  if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isDS(*MI) || SIInstrInfo::isEXP(*MI)) &&
      checkMAIVALUHazards(MI) > 0)
    return HazardType;

  if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
    return HazardType;

  if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
    return HazardType;

  if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
    return HazardType;

  if (((ST.hasReadM0MovRelInterpHazard() &&
        (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
         MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
         MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
       (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
       (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
       (ST.hasReadM0LdsDirectHazard() &&
        MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr))) &&
      checkReadM0Hazards(MI) > 0)
    return HazardType;

  if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
    return HazardType;

  if ((SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isDS(*MI)) &&
      checkMAILdStHazards(MI) > 0)
    return HazardType;

  if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
    return HazardType;

  return NoHazard;
}

static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
                                unsigned Quantity) {
  // A single S_NOP can cover at most 8 wait states; its immediate encodes
  // the wait-state count minus one.
  while (Quantity > 0) {
    unsigned Arg = std::min(Quantity, 8u);
    Quantity -= Arg;
    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
        .addImm(Arg - 1);
  }
}

unsigned
GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
  const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI);
  assert(TSchedModel.getWriteProcResBegin(SC) !=
         TSchedModel.getWriteProcResEnd(SC));
  return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle;
}

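// When advancing over a BUNDLE, check each bundled instruction for hazards
// individually; in hazard recognizer mode the required noops are inserted
// inside the bundle.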
void GCNHazardRecognizer::processBundle() {
  MachineBasicBlock::instr_iterator MI =
      std::next(CurrCycleInstr->getIterator());
  MachineBasicBlock::instr_iterator E =
      CurrCycleInstr->getParent()->instr_end();
  // Check bundled MachineInstrs for hazards.
  for (; MI != E && MI->isInsideBundle(); ++MI) {
    CurrCycleInstr = &*MI;
    unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);

    if (IsHazardRecognizerMode) {
      fixHazards(CurrCycleInstr);

      insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
    }

    // It's unnecessary to track more than MaxLookAhead instructions. Since we
    // include the bundled MI directly after, only add a maximum of
    // (MaxLookAhead - 1) noops to EmittedInstrs.
    for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
      EmittedInstrs.push_front(nullptr);

    EmittedInstrs.push_front(CurrCycleInstr);
    EmittedInstrs.resize(MaxLookAhead);
  }
  CurrCycleInstr = nullptr;
}

void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) {
  assert(IsHazardRecognizerMode);

  unsigned NumPreNoops = PreEmitNoops(MI);
  EmitNoops(NumPreNoops);
  if (MI->isInsideBundle())
    insertNoopsInBundle(MI, TII, NumPreNoops);
  else
    TII.insertNoops(*MI->getParent(), MachineBasicBlock::iterator(MI),
                    NumPreNoops);
  EmitInstruction(MI);
  AdvanceCycle();
}

unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
  IsHazardRecognizerMode = true;
  CurrCycleInstr = MI;
  unsigned W = PreEmitNoopsCommon(MI);
  fixHazards(MI);
  CurrCycleInstr = nullptr;
  return std::max(W, NopPadding.getValue());
}

unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
  if (MI->isBundle())
    return 0;

  int WaitStates = 0;

  if (SIInstrInfo::isSMRD(*MI))
    return std::max(WaitStates, checkSMRDHazards(MI));

  if (ST.hasNSAtoVMEMBug())
    WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));

  WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));

  if (ST.hasNoDataDepHazard())
    return WaitStates;

  if (SIInstrInfo::isVMEM(*MI))
    WaitStates = std::max(WaitStates, checkVMEMHazards(MI));

  if (SIInstrInfo::isVALU(*MI))
    WaitStates = std::max(WaitStates, checkVALUHazards(MI));

  if (SIInstrInfo::isDPP(*MI))
    WaitStates = std::max(WaitStates, checkDPPHazards(MI));

  if (isDivFMas(MI->getOpcode()))
    WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));

  if (isRWLane(MI->getOpcode()))
    WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));

  if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isDS(*MI) || SIInstrInfo::isEXP(*MI)) &&
      checkMAIVALUHazards(MI) > 0)
    WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));

  if (MI->isInlineAsm())
    return std::max(WaitStates, checkInlineAsmHazards(MI));

  if (isSGetReg(MI->getOpcode()))
    return std::max(WaitStates, checkGetRegHazards(MI));

  if (isSSetReg(MI->getOpcode()))
    return std::max(WaitStates, checkSetRegHazards(MI));

  if (isRFE(MI->getOpcode()))
    return std::max(WaitStates, checkRFEHazards(MI));

  if ((ST.hasReadM0MovRelInterpHazard() &&
       (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
        MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
        MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
      (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
      (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
      (ST.hasReadM0LdsDirectHazard() &&
       MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr)))
    return std::max(WaitStates, checkReadM0Hazards(MI));

  if (SIInstrInfo::isMAI(*MI))
    return std::max(WaitStates, checkMAIHazards(MI));

  if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isDS(*MI))
    return std::max(WaitStates, checkMAILdStHazards(MI));

  if (ST.hasGFX950Insts() && isPermlane(*MI))
    return std::max(WaitStates, checkPermlaneHazards(MI));

  return WaitStates;
}

void GCNHazardRecognizer::EmitNoop() {
  EmittedInstrs.push_front(nullptr);
}

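// Model one cycle of instruction issue: record CurrCycleInstr and represent
// its additional wait states by pushing nullptr entries onto EmittedInstrs.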
void GCNHazardRecognizer::AdvanceCycle() {
  // When the scheduler detects a stall, it will call AdvanceCycle() without
  // emitting any instructions.
  if (!CurrCycleInstr) {
    EmittedInstrs.push_front(nullptr);
    return;
  }

  if (CurrCycleInstr->isBundle()) {
    processBundle();
    return;
  }

  unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
  if (!NumWaitStates) {
    CurrCycleInstr = nullptr;
    return;
  }

  // Keep track of emitted instructions.
  EmittedInstrs.push_front(CurrCycleInstr);

  // Add a nullptr for each additional wait state after the first. Make sure
  // not to add more than getMaxLookAhead() items to the list, since we
  // truncate the list to that size right after this loop.
  for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
       i < e; ++i) {
    EmittedInstrs.push_front(nullptr);
  }

  // getMaxLookahead() is the largest number of wait states we will ever need
  // to insert, so there is no point in keeping track of more than that many
  // wait states.
  EmittedInstrs.resize(getMaxLookAhead());

  CurrCycleInstr = nullptr;
}

void GCNHazardRecognizer::RecedeCycle() {
  assert(!IsHazardRecognizerMode &&
         "Bottom-up scheduling shouldn't run in hazard recognizer mode");
}

//===----------------------------------------------------------------------===//
// Helper Functions
//===----------------------------------------------------------------------===//

using HazardFnResult = enum { HazardFound, HazardExpired, NoHazardFound };

using IsExpiredFn = function_ref<bool(const MachineInstr &, int WaitStates)>;
using GetNumWaitStatesFn = function_ref<unsigned int(const MachineInstr &)>;

// Search for a hazard in a block and its predecessors.
template <typename StateT>
static bool
hasHazard(StateT State,
          function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard,
          function_ref<void(StateT &, const MachineInstr &)> UpdateState,
          const MachineBasicBlock *MBB,
          MachineBasicBlock::const_reverse_instr_iterator I,
          DenseSet<const MachineBasicBlock *> &Visited) {
  for (auto E = MBB->instr_rend(); I != E; ++I) {
    // No need to look at parent BUNDLE instructions.
    if (I->isBundle())
      continue;

    switch (IsHazard(State, *I)) {
    case HazardFound:
      return true;
    case HazardExpired:
      return false;
    default:
      // Continue search.
      break;
    }

    if (I->isInlineAsm() || I->isMetaInstruction())
      continue;

    UpdateState(State, *I);
  }

  for (MachineBasicBlock *Pred : MBB->predecessors()) {
    if (!Visited.insert(Pred).second)
      continue;

    if (hasHazard(State, IsHazard, UpdateState, Pred, Pred->instr_rbegin(),
                  Visited))
      return true;
  }

  return false;
}

// Returns the minimum number of wait states since \p I, walking all
// predecessors. Only scans until \p IsExpired returns true. Can only be
// run in hazard recognizer mode.
static int getWaitStatesSince(
    GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB,
    MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates,
    IsExpiredFn IsExpired, DenseSet<const MachineBasicBlock *> &Visited,
    GetNumWaitStatesFn GetNumWaitStates = SIInstrInfo::getNumWaitStates) {
  for (auto E = MBB->instr_rend(); I != E; ++I) {
    // Don't add WaitStates for parent BUNDLE instructions.
    if (I->isBundle())
      continue;

    if (IsHazard(*I))
      return WaitStates;

    if (I->isInlineAsm())
      continue;

    WaitStates += GetNumWaitStates(*I);

    if (IsExpired(*I, WaitStates))
      return std::numeric_limits<int>::max();
  }

  int MinWaitStates = std::numeric_limits<int>::max();
  for (MachineBasicBlock *Pred : MBB->predecessors()) {
    if (!Visited.insert(Pred).second)
      continue;

    int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(),
                               WaitStates, IsExpired, Visited,
                               GetNumWaitStates);

    MinWaitStates = std::min(MinWaitStates, W);
  }

  return MinWaitStates;
}

static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
                              const MachineInstr *MI, IsExpiredFn IsExpired) {
  DenseSet<const MachineBasicBlock *> Visited;
  return getWaitStatesSince(IsHazard, MI->getParent(),
                            std::next(MI->getReverseIterator()),
                            0, IsExpired, Visited);
}

int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
  if (IsHazardRecognizerMode) {
    auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
      return WaitStates >= Limit;
    };
    return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
  }

  int WaitStates = 0;
  for (MachineInstr *MI : EmittedInstrs) {
    if (MI) {
      if (IsHazard(*MI))
        return WaitStates;

      if (MI->isInlineAsm())
        continue;
    }
    ++WaitStates;

    if (WaitStates >= Limit)
      break;
  }
  return std::numeric_limits<int>::max();
}

int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
                                               IsHazardFn IsHazardDef,
                                               int Limit) {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
    return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
  };

  return getWaitStatesSince(IsHazardFn, Limit);
}

int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
                                                  int Limit) {
  auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
    return isSSetReg(MI.getOpcode()) && IsHazard(MI);
  };

  return getWaitStatesSince(IsHazardFn, Limit);
}

//===----------------------------------------------------------------------===//
// No-op Hazard Detection
//===----------------------------------------------------------------------===//

static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
                        MCRegister Reg) {
  for (MCRegUnit Unit : TRI.regunits(Reg))
    BV.set(Unit);
}

static void addRegsToSet(const SIRegisterInfo &TRI,
                         iterator_range<MachineInstr::const_mop_iterator> Ops,
                         BitVector &DefSet, BitVector &UseSet) {
  for (const MachineOperand &Op : Ops) {
    if (Op.isReg())
      addRegUnits(TRI, Op.isDef() ? DefSet : UseSet, Op.getReg().asMCReg());
  }
}

void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
  addRegsToSet(TRI, MI.operands(), ClauseDefs, ClauseUses);
}

static bool breaksSMEMSoftClause(MachineInstr *MI) {
  return !SIInstrInfo::isSMRD(*MI);
}

static bool breaksVMEMSoftClause(MachineInstr *MI) {
  return !SIInstrInfo::isVMEM(*MI);
}

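// Each of the check* routines below returns the number of wait states that
// still must be inserted before the instruction being checked. Most compute
// this as the required hazard distance minus the wait states already observed
// via getWaitStatesSince*, so a result <= 0 means no hazard.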
int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
  // SMEM soft clauses are only present on VI+, and only matter if xnack is
  // enabled.
  if (!ST.isXNACKEnabled())
    return 0;

  bool IsSMRD = TII.isSMRD(*MEM);

  resetClause();

  // A soft-clause is any group of consecutive SMEM instructions. The
  // instructions in this group may return out of order and/or may be
  // replayed (i.e. the same instruction issued more than once).
  //
  // In order to handle these situations correctly we need to make sure that
  // when a clause has more than one instruction, no instruction in the clause
  // writes to a register that is read by another instruction in the clause
  // (including itself). If we encounter this situation, we need to break the
  // clause by inserting a non-SMEM instruction.

  for (MachineInstr *MI : EmittedInstrs) {
    // When we hit a non-SMEM instruction then we have passed the start of the
    // clause and we can stop.
    if (!MI)
      break;

    if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
      break;

    addClauseInst(*MI);
  }

  if (ClauseDefs.none())
    return 0;

  // We need to make sure not to put loads and stores in the same clause if
  // they use the same address. For now, just start a new clause whenever we
  // see a store.
  if (MEM->mayStore())
    return 1;

  addClauseInst(*MEM);

  // If the set of defs and uses intersect then we cannot add this instruction
  // to the clause, so we have a hazard.
  return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
}

int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
  int WaitStatesNeeded = 0;

  WaitStatesNeeded = checkSoftClauseHazards(SMRD);

  // This SMRD hazard only affects SI.
  if (!ST.hasSMRDReadVALUDefHazard())
    return WaitStatesNeeded;

  // A read of an SGPR by an SMRD instruction requires 4 wait states when the
  // SGPR was written by a VALU instruction.
  int SmrdSgprWaitStates = 4;
  auto IsHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isVALU(MI);
  };
  auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isSALU(MI);
  };

  bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);

  for (const MachineOperand &Use : SMRD->uses()) {
    if (!Use.isReg())
      continue;
    int WaitStatesNeededForUse =
        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   SmrdSgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    // This fixes what appears to be undocumented hardware behavior in SI
    // where an s_mov writing a descriptor and an s_buffer_load_dword reading
    // the descriptor need some number of nops in between. We don't know how
    // many we need, but let's use 4. This wasn't discovered before probably
    // because the only case when this happens is when we expand a 64-bit
    // pointer into a full descriptor and use s_buffer_load_dword instead of
    // s_load_dword, which was probably never encountered in closed-source
    // land.
    if (IsBufferSMRD) {
      int WaitStatesNeededForUse =
          SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
                                                     IsBufferHazardDefFn,
                                                     SmrdSgprWaitStates);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
    }
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkVMEMHazards(MachineInstr *VMEM) {
  if (!ST.hasVMEMReadSGPRVALUDefHazard())
    return 0;

  int WaitStatesNeeded = checkSoftClauseHazards(VMEM);

  // A read of an SGPR by a VMEM instruction requires 5 wait states when the
  // SGPR was written by a VALU instruction.
  const int VmemSgprWaitStates = 5;
  auto IsHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isVALU(MI);
  };
  for (const MachineOperand &Use : VMEM->uses()) {
    if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))
      continue;

    int WaitStatesNeededForUse =
        VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   VmemSgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }
  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();

  // Check for DPP VGPR read after VALU VGPR write and EXEC write.
  int DppVgprWaitStates = 2;
  int DppExecWaitStates = 5;
  int WaitStatesNeeded = 0;
  auto IsHazardDefFn = [TII](const MachineInstr &MI) {
    return TII->isVALU(MI);
  };

  for (const MachineOperand &Use : DPP->uses()) {
    if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
      continue;
    int WaitStatesNeededForUse =
        DppVgprWaitStates - getWaitStatesSinceDef(
                                Use.getReg(),
                                [](const MachineInstr &) { return true; },
                                DppVgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  WaitStatesNeeded = std::max(
      WaitStatesNeeded,
      DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
                                                DppExecWaitStates));

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
  const SIInstrInfo *TII = ST.getInstrInfo();

  // v_div_fmas requires 4 wait states after a write to vcc from a VALU
  // instruction.
  const int DivFMasWaitStates = 4;
  auto IsHazardDefFn = [TII](const MachineInstr &MI) {
    return TII->isVALU(MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
                                               DivFMasWaitStates);

  return DivFMasWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);

  const int GetRegWaitStates = 2;
  auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
    return GetRegHWReg == getHWReg(TII, MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);

  return GetRegWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned HWReg = getHWReg(TII, *SetRegInstr);

  const int SetRegWaitStates = ST.getSetRegWaitStates();
  auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
    return HWReg == getHWReg(TII, MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
  return SetRegWaitStates - WaitStatesNeeded;
}

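// If \p MI is a store whose store data could be overwritten by the next
// instruction (i.e. it writes more than 64 bits of store data), return the
// operand index of that data (vdata); otherwise return -1.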
int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
  if (!MI.mayStore())
    return -1;

  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned Opcode = MI.getOpcode();
  const MCInstrDesc &Desc = MI.getDesc();

  int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
  int VDataRCID = -1;
  if (VDataIdx != -1)
    VDataRCID = Desc.operands()[VDataIdx].RegClass;

  if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
    // There is no hazard if the instruction does not use vector regs
    // (like wbinvl1).
    if (VDataIdx == -1)
      return -1;
    // For MUBUF/MTBUF instructions this hazard only exists if the
    // instruction is not using a register in the soffset field.
    const MachineOperand *SOffset =
        TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
    // If we have no soffset operand, then assume this field has been
    // hardcoded to zero.
    if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
        (!SOffset || !SOffset->isReg()))
      return VDataIdx;
  }

  // MIMG instructions create a hazard if they don't use a 256-bit T# and
  // the store size is greater than 8 bytes and they have more than two bits
  // of their dmask set.
  // All our MIMG definitions use a 256-bit T#, so we can skip checking for
  // them.
  if (TII->isMIMG(MI)) {
    int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
    assert(SRsrcIdx != -1 &&
           AMDGPU::getRegBitWidth(Desc.operands()[SRsrcIdx].RegClass) == 256);
    (void)SRsrcIdx;
  }

  if (TII->isFLAT(MI)) {
    // There is no hazard if the instruction does not use vector regs.
    if (VDataIdx == -1)
      return -1;

    if (AMDGPU::getRegBitWidth(VDataRCID) > 64)
      return VDataIdx;
  }

  return -1;
}

int
GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
                                            const MachineRegisterInfo &MRI) {
  // Helper to check for the hazard where VMEM instructions that store more
  // than 8 bytes can have their store data overwritten by the next
  // instruction.
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1;
  int WaitStatesNeeded = 0;

  if (!TRI->isVectorRegister(MRI, Def.getReg()))
    return WaitStatesNeeded;
  Register Reg = Def.getReg();
  auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
    int DataIdx = createsVALUHazard(MI);
    return DataIdx >= 0 &&
           TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
  };

  int WaitStatesNeededForDef =
      VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

  return WaitStatesNeeded;
}

/// Dest sel forwarding issue occurs if additional logic is needed to swizzle /
/// pack the computed value into the correct bit position of the dest register.
/// This occurs if we have SDWA with dst_sel != DWORD or if we have op_sel with
/// dst_sel that is not aligned to the register. This function analyzes the \p
/// MI and \returns an operand with dst forwarding issue, or nullptr if
/// none exists.
static const MachineOperand *
getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
  if (!SIInstrInfo::isVALU(MI))
    return nullptr;

  const SIInstrInfo *TII = ST.getInstrInfo();

  unsigned Opcode = MI.getOpcode();

  // There are three different types of instructions which produce forwarded
  // dest: 1. SDWA with dst_sel != DWORD, 2. VOP3 which write the hi bits
  // (e.g. op_sel[3] == 1), and 3. FP8DstSelInst (instructions with dest byte
  // sel, e.g. CVT_SR_BF8_F32) with op_sel[3:2] != 0.
  if (SIInstrInfo::isSDWA(MI)) {
    // Type 1: SDWA with dst_sel != DWORD
    if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
      if (DstSel->getImm() != AMDGPU::SDWA::DWORD)
        return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  }

  AMDGPU::FPType IsFP4OrFP8ConvOpc = AMDGPU::getFPDstSelType(Opcode);
  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel)) {
    // Type 2: VOP3 which write the hi bits
    if (TII->getNamedImmOperand(MI, AMDGPU::OpName::src0_modifiers) &
        SISrcMods::DST_OP_SEL)
      return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    // Type 3: FP8DstSelInst with op_sel[3:2] != 0
    if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP8 &&
        (TII->getNamedImmOperand(MI, AMDGPU::OpName::src2_modifiers) &
         SISrcMods::OP_SEL_0))
      return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  }

  // Special case: nop is required for all the opsel values for fp4 sr variant
  // cvt scale instructions
  if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP4)
    return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

  return nullptr;
}

/// Checks whether the provided \p MI "consumes" the operand with a Dest sel
/// forwarding issue \p Dst. We may "consume" the Dst via a standard explicit
/// RAW, or through irregular ways (e.g. implicit RAW, certain types of WAW)
static bool consumesDstSelForwardingOperand(const MachineInstr *VALU,
                                            const MachineOperand *Dst,
                                            const SIRegisterInfo *TRI) {
  // We must consider implicit reads of the VALU. SDWA with dst_sel and
  // UNUSED_PRESERVE will implicitly read the result from forwarded dest,
  // and we must account for that hazard.
  // We also must account for WAW hazards. In particular, WAW with dest
  // preserve semantics (e.g. VOP3 with op_sel, VOP2 &&
  // !zeroesHigh16BitsOfDest) will read the forwarded dest for the ECC parity
  // check. Without accounting for this hazard, the ECC will be wrong.
  // TODO: limit to RAW (including implicit reads) + problematic WAW (i.e.
  // complete zeroesHigh16BitsOfDest)
  for (auto &Operand : VALU->operands()) {
    if (Operand.isReg() && TRI->regsOverlap(Dst->getReg(), Operand.getReg())) {
      return true;
    }
  }
  return false;
}

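// Check the data hazards that apply to a VALU instruction (TRANS forwarding,
// dst-sel forwarding, VALU SGPR writes, and the 12-dword store hazard) and
// return the number of wait states needed before \p VALU.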
int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
  int WaitStatesNeeded = 0;

  if (ST.hasTransForwardingHazard() && !SIInstrInfo::isTRANS(*VALU)) {
    const int TransDefWaitstates = 1;

    auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {
      if (!SIInstrInfo::isTRANS(MI))
        return false;
      const SIRegisterInfo *TRI = ST.getRegisterInfo();
      const SIInstrInfo *TII = ST.getInstrInfo();
      Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();

      for (const MachineOperand &Use : VALU->explicit_uses()) {
        if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
          return true;
      }

      return false;
    };

    int WaitStatesNeededForDef =
        TransDefWaitstates -
        getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
  }

  if (ST.hasDstSelForwardingHazard() || ST.hasCvtScaleForwardingHazard()) {
    const int Shift16DefWaitstates = 1;

    auto IsShift16BitDefFn = [this, VALU](const MachineInstr &ProducerMI) {
      const SIRegisterInfo *TRI = ST.getRegisterInfo();
      const MachineOperand *ForwardedDst =
          getDstSelForwardingOperand(ProducerMI, ST);
      if (ForwardedDst) {
        return consumesDstSelForwardingOperand(VALU, ForwardedDst, TRI);
      }

      if (ProducerMI.isInlineAsm()) {
        // Assume inline asm has dst forwarding hazard.
        for (auto &Def : ProducerMI.all_defs()) {
          if (consumesDstSelForwardingOperand(VALU, &Def, TRI))
            return true;
        }
      }

      return false;
    };

    int WaitStatesNeededForDef =
        Shift16DefWaitstates -
        getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
  }

  if (ST.hasVDecCoExecHazard()) {
    const int VALUWriteSGPRVALUReadWaitstates = 2;
    const int VALUWriteEXECRWLane = 4;
    const int VALUWriteVGPRReadlaneRead = 1;

    const SIRegisterInfo *TRI = ST.getRegisterInfo();
    const MachineRegisterInfo &MRI = MF.getRegInfo();
    Register UseReg;
    auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
      if (!SIInstrInfo::isVALU(MI))
        return false;
      return MI.modifiesRegister(UseReg, TRI);
    };

    for (const MachineOperand &Use : VALU->explicit_uses()) {
      if (!Use.isReg())
        continue;

      UseReg = Use.getReg();
      if (TRI->isSGPRReg(MRI, UseReg)) {
        int WaitStatesNeededForDef =
            VALUWriteSGPRVALUReadWaitstates -
            getWaitStatesSince(IsVALUDefSGPRFn,
                               VALUWriteSGPRVALUReadWaitstates);
        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
      }
    }

    if (VALU->readsRegister(AMDGPU::VCC, TRI)) {
      UseReg = AMDGPU::VCC;
      int WaitStatesNeededForDef =
          VALUWriteSGPRVALUReadWaitstates -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
    }

    switch (VALU->getOpcode()) {
    case AMDGPU::V_READLANE_B32:
    case AMDGPU::V_READFIRSTLANE_B32: {
      MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);
      UseReg = Src->getReg();
      int WaitStatesNeededForDef =
          VALUWriteVGPRReadlaneRead -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
    }
      [[fallthrough]];
    case AMDGPU::V_WRITELANE_B32: {
      UseReg = AMDGPU::EXEC;
      int WaitStatesNeededForDef =
          VALUWriteEXECRWLane -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
      break;
    }
    default:
      break;
    }
  }

  // This checks for the hazard where VMEM instructions that store more than
  // 8 bytes can have their store data overwritten by the next instruction.
  if (!ST.has12DWordStoreHazard())
    return WaitStatesNeeded;

  const MachineRegisterInfo &MRI = MF.getRegInfo();

  for (const MachineOperand &Def : VALU->defs()) {
    WaitStatesNeeded =
        std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
  // This checks for hazards associated with inline asm statements.
  // Since inline asms can contain just about anything, we use this
  // to call/leverage other check*Hazard routines. Note that
  // this function doesn't attempt to address all possible inline asm
  // hazards (good luck), but is a collection of what has been
  // problematic thus far.

  // see checkVALUHazards()
  if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard() &&
      !ST.hasCvtScaleForwardingHazard())
    return 0;

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  int WaitStatesNeeded = 0;

  for (const MachineOperand &Op :
       llvm::drop_begin(IA->operands(), InlineAsm::MIOp_FirstOperand)) {
    if (Op.isReg() && Op.isDef()) {
      if (!TRI.isVectorRegister(MRI, Op.getReg()))
        continue;

      if (ST.has12DWordStoreHazard()) {
        WaitStatesNeeded =
            std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
      }
    }
  }

  if (ST.hasDstSelForwardingHazard()) {
    const int Shift16DefWaitstates = 1;

    auto IsShift16BitDefFn = [this, &IA](const MachineInstr &ProducerMI) {
      const MachineOperand *Dst = getDstSelForwardingOperand(ProducerMI, ST);
      // Assume inline asm reads the dst.
      if (Dst)
        return IA->modifiesRegister(Dst->getReg(), &TRI) ||
               IA->readsRegister(Dst->getReg(), &TRI);

      if (ProducerMI.isInlineAsm()) {
        // If MI is inline asm, assume it has dst forwarding hazard.
        for (auto &Def : ProducerMI.all_defs()) {
          if (IA->modifiesRegister(Def.getReg(), &TRI) ||
              IA->readsRegister(Def.getReg(), &TRI)) {
            return true;
          }
        }
      }

      return false;
    };

    int WaitStatesNeededForDef =
        Shift16DefWaitstates -
        getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  const MachineOperand *LaneSelectOp =
      TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);

  if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
    return 0;

  Register LaneSelectReg = LaneSelectOp->getReg();
  auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };

  const int RWLaneWaitStates = 4;
  int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
                                              RWLaneWaitStates);
  return RWLaneWaitStates - WaitStatesSince;
}

int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
  if (!ST.hasRFEHazards())
    return 0;

  const SIInstrInfo *TII = ST.getInstrInfo();

  const int RFEWaitStates = 1;

  auto IsHazardFn = [TII](const MachineInstr &MI) {
    return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS;
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
  return RFEWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const int ReadM0WaitStates = 1;
  auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
  return ReadM0WaitStates -
         getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);
}

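// Run the fix* routines, each of which may insert instructions (waits, moves
// or nops) immediately before \p MI to break hazards that are not handled by
// plain wait-state padding.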
void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
  fixVMEMtoScalarWriteHazards(MI);
  fixVcmpxPermlaneHazards(MI);
  fixSMEMtoVectorWriteHazards(MI);
  fixVcmpxExecWARHazard(MI);
  fixLdsBranchVmemWARHazard(MI);
  if (ST.hasLdsDirect()) {
    fixLdsDirectVALUHazard(MI);
    fixLdsDirectVMEMHazard(MI);
  }
  fixVALUPartialForwardingHazard(MI);
  fixVALUTransUseHazard(MI);
  fixWMMAHazards(MI);
  fixShift64HighRegBug(MI);
  fixVALUMaskWriteHazard(MI);
  fixRequiredExportPriority(MI);
}

static bool isVCmpXWritesExec(const SIInstrInfo &TII, const SIRegisterInfo &TRI,
                              const MachineInstr &MI) {
  return (TII.isVOPC(MI) ||
          (MI.isCompare() && (TII.isVOP3(MI) || TII.isSDWA(MI)))) &&
         MI.modifiesRegister(AMDGPU::EXEC, &TRI);
}

bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
  if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  auto IsHazardFn = [TII, TRI](const MachineInstr &MI) {
    return isVCmpXWritesExec(*TII, *TRI, MI);
  };

  auto IsExpiredFn = [](const MachineInstr &MI, int) {
    unsigned Opc = MI.getOpcode();
    return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
           Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  // V_NOP will be discarded by SQ.
  // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
  // which is always a VGPR and available.
  auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
  Register Reg = Src0->getReg();
  bool IsUndef = Src0->isUndef();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::V_MOV_B32_e32))
      .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
      .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);

  return true;
}

bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
  if (!ST.hasVMEMtoScalarWriteHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
    return false;

  if (MI->getNumDefs() == 0)
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
    if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I))
      return false;

    for (const MachineOperand &Def : MI->defs()) {
      const MachineOperand *Op =
          I.findRegisterUseOperand(Def.getReg(), TRI, false);
      if (!Op)
        continue;
      return true;
    }
    return false;
  };

  auto IsExpiredFn = [](const MachineInstr &MI, int) {
    return SIInstrInfo::isVALU(MI) ||
           (MI.getOpcode() == AMDGPU::S_WAITCNT &&
            !MI.getOperand(0).getImm()) ||
           (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
            AMDGPU::DepCtr::decodeFieldVmVsrc(MI.getOperand(0).getImm()) == 0);
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
  return true;
}

bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
  if (!ST.hasSMEMtoVectorWriteHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!SIInstrInfo::isVALU(*MI))
    return false;

  AMDGPU::OpName SDSTName;
  switch (MI->getOpcode()) {
  case AMDGPU::V_READLANE_B32:
  case AMDGPU::V_READFIRSTLANE_B32:
    SDSTName = AMDGPU::OpName::vdst;
    break;
  default:
    SDSTName = AMDGPU::OpName::sdst;
    break;
  }

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
  const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
  if (!SDST) {
    for (const auto &MO : MI->implicit_operands()) {
      if (MO.isDef() &&
          TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) {
        SDST = &MO;
        break;
      }
    }
  }

  if (!SDST)
    return false;

  const Register SDSTReg = SDST->getReg();
  auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
    return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
  };

  auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
    if (TII->isSALU(MI)) {
      switch (MI.getOpcode()) {
      case AMDGPU::S_SETVSKIP:
      case AMDGPU::S_VERSION:
      case AMDGPU::S_WAITCNT_VSCNT:
      case AMDGPU::S_WAITCNT_VMCNT:
      case AMDGPU::S_WAITCNT_EXPCNT:
        // These instructions cannot mitigate the hazard.
        return false;
      case AMDGPU::S_WAITCNT_LGKMCNT:
        // Reducing lgkmcnt count to 0 always mitigates the hazard.
        return (MI.getOperand(1).getImm() == 0) &&
               (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
      case AMDGPU::S_WAITCNT: {
        const int64_t Imm = MI.getOperand(0).getImm();
        AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
        // DsCnt corresponds to LGKMCnt here.
        return (Decoded.DsCnt == 0);
      }
      default:
        // SOPP instructions cannot mitigate the hazard.
        if (TII->isSOPP(MI))
          return false;
        // At this point the SALU can be assumed to mitigate the hazard
        // because either:
        // (a) it is independent of the at risk SMEM (breaking chain),
        // or
        // (b) it is dependent on the SMEM, in which case an appropriate
        //     s_waitcnt lgkmcnt _must_ exist between it and the at risk
        //     SMEM instruction.
        return true;
      }
    }
    return false;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
      .addImm(0);
  return true;
}

bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
  if (!ST.hasVcmpxExecWARHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!SIInstrInfo::isVALU(*MI))
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
    return false;

  auto IsHazardFn = [TRI](const MachineInstr &I) {
    if (SIInstrInfo::isVALU(I))
      return false;
    return I.readsRegister(AMDGPU::EXEC, TRI);
  };

  const SIInstrInfo *TII = ST.getInstrInfo();
  auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
    if (SIInstrInfo::isVALU(MI)) {
      if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
        return true;
      for (auto MO : MI.implicit_operands())
        if (MO.isDef() &&
            TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg())))
          return true;
    }
    if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
        AMDGPU::DepCtr::decodeFieldSaSdst(MI.getOperand(0).getImm()) == 0)
      return true;
    return false;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
  return true;
}

static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
                                                 const GCNSubtarget &ST) {
  if (!ST.hasLdsBranchVmemWARHazard())
    return false;

  // Check if the necessary condition for the hazard is met: both LDS and VMEM
  // instructions need to appear in the same function.
  bool HasLds = false;
  bool HasVmem = false;
  for (auto &MBB : MF) {
    for (auto &MI : MBB) {
      HasLds |= SIInstrInfo::isDS(MI);
      HasVmem |= (SIInstrInfo::isVMEM(MI) && !SIInstrInfo::isFLAT(MI)) ||
                 SIInstrInfo::isSegmentSpecificFLAT(MI);
      if (HasLds && HasVmem)
        return true;
    }
  }
  return false;
}

static bool isStoreCountWaitZero(const MachineInstr &I) {
  return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
         I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
         !I.getOperand(1).getImm();
}

bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
  if (!RunLdsBranchVmemWARHazardFixup)
    return false;

  assert(ST.hasLdsBranchVmemWARHazard());
  assert(!ST.hasExtendedWaitCounts());

  auto IsHazardInst = [](const MachineInstr &MI) {
    if (SIInstrInfo::isDS(MI))
      return 1;
    if ((SIInstrInfo::isVMEM(MI) && !SIInstrInfo::isFLAT(MI)) ||
        SIInstrInfo::isSegmentSpecificFLAT(MI))
      return 2;
    return 0;
  };

  auto InstType = IsHazardInst(*MI);
  if (!InstType)
    return false;

  auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
    return IsHazardInst(I) || isStoreCountWaitZero(I);
  };

  auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
    if (!I.isBranch())
      return false;

    auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
      auto InstType2 = IsHazardInst(I);
      return InstType2 && InstType != InstType2;
    };

    auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
      auto InstType2 = IsHazardInst(I);
      if (InstType == InstType2)
        return true;

      return isStoreCountWaitZero(I);
    };

    return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) !=
           std::numeric_limits<int>::max();
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_VSCNT))
      .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
      .addImm(0);

  return true;
}

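// For an LDSDIR instruction, compute the distance (in VALU instructions) to
// the closest VALU that reads or writes the LDSDIR destination, and encode it
// in the waitvdst operand (15 meaning no hazard).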
bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
  if (!SIInstrInfo::isLDSDIR(*MI))
    return false;

  const int NoHazardWaitStates = 15;
  const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
  const Register VDSTReg = VDST->getReg();

  bool VisitedTrans = false;
  auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) {
    if (!SIInstrInfo::isVALU(I))
      return false;
    VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(I);
    // Cover both WAR and WAW.
    return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
  };
  auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) {
    if (WaitStates >= NoHazardWaitStates)
      return true;
    // Instructions which cause va_vdst==0 expire hazard.
    return SIInstrInfo::isVMEM(I) || SIInstrInfo::isDS(I) ||
           SIInstrInfo::isEXP(I);
  };
  auto GetWaitStatesFn = [](const MachineInstr &MI) {
    return SIInstrInfo::isVALU(MI) ? 1 : 0;
  };

  DenseSet<const MachineBasicBlock *> Visited;
  auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
                                    std::next(MI->getReverseIterator()), 0,
                                    IsExpiredFn, Visited, GetWaitStatesFn);

  // Transcendentals can execute in parallel to other VALUs.
  // This makes va_vdst count unusable with a mixture of VALU and TRANS.
  if (VisitedTrans)
    Count = 0;

  MachineOperand *WaitVdstOp =
      TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst);
  WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates));

  return true;
}

bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
  if (!SIInstrInfo::isLDSDIR(*MI))
    return false;

  const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
  const Register VDSTReg = VDST->getReg();

  auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) {
    if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I))
      return false;
    return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
  };
  bool LdsdirCanWait = ST.hasLdsWaitVMSRC();
  // TODO: On GFX12 the hazard should expire on S_WAIT_LOADCNT/SAMPLECNT/BVHCNT
  // according to the type of VMEM instruction.
  auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) {
    return SIInstrInfo::isVALU(I) || SIInstrInfo::isEXP(I) ||
           (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) ||
           (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
            AMDGPU::DepCtr::decodeFieldVmVsrc(I.getOperand(0).getImm()) == 0) ||
           (LdsdirCanWait && SIInstrInfo::isLDSDIR(I) &&
            !TII.getNamedOperand(I, AMDGPU::OpName::waitvsrc)->getImm());
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  if (LdsdirCanWait) {
    TII.getNamedOperand(*MI, AMDGPU::OpName::waitvsrc)->setImm(0);
  } else {
    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
            TII.get(AMDGPU::S_WAITCNT_DEPCTR))
        .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
  }

  return true;
}

bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
  if (!ST.hasVALUPartialForwardingHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!ST.isWave64() || !SIInstrInfo::isVALU(*MI))
    return false;

  SmallSetVector<Register, 4> SrcVGPRs;

  for (const MachineOperand &Use : MI->explicit_uses()) {
    if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
      SrcVGPRs.insert(Use.getReg());
  }

  // Only applies with >= 2 unique VGPR sources.
  if (SrcVGPRs.size() <= 1)
    return false;

  // Look for the following pattern:
  //   Va <- VALU [PreExecPos]
  //   intv1
  //   Exec <- SALU [ExecPos]
  //   intv2
  //   Vb <- VALU [PostExecPos]
  //   intv3
  //   MI Va, Vb (WaitState = 0)
  //
  // Where:
  //   intv1 + intv2 <= 2 VALUs
  //   intv3 <= 4 VALUs
  //
  // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.

  const int Intv1plus2MaxVALUs = 2;
  const int Intv3MaxVALUs = 4;
  const int IntvMaxVALUs = 6;
  const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;

  struct StateType {
    SmallDenseMap<Register, int, 4> DefPos;
    int ExecPos = std::numeric_limits<int>::max();
    int VALUs = 0;
  };

  StateType State;

  // This overloads expiry testing with all the hazard detection.
  auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
    // Too many VALU states have passed.
    if (State.VALUs > NoHazardVALUWaitStates)
      return HazardExpired;

    // Instructions which cause va_vdst==0 expire hazard.
    if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isDS(I) ||
        SIInstrInfo::isEXP(I) ||
        (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
         AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
      return HazardExpired;

    // Track register writes.
    bool Changed = false;
    if (SIInstrInfo::isVALU(I)) {
      for (Register Src : SrcVGPRs) {
        if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) {
          State.DefPos[Src] = State.VALUs;
          Changed = true;
        }
      }
    } else if (SIInstrInfo::isSALU(I)) {
      if (State.ExecPos == std::numeric_limits<int>::max()) {
        if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
          State.ExecPos = State.VALUs;
          Changed = true;
        }
      }
    }

    // Early expiration: too many VALUs in intv3.
    if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
      return HazardExpired;

    // Only evaluate state if something changed.
    if (!Changed)
      return NoHazardFound;

    // Determine positions of VALUs pre/post exec change.
    if (State.ExecPos == std::numeric_limits<int>::max())
      return NoHazardFound;

    int PreExecPos = std::numeric_limits<int>::max();
    int PostExecPos = std::numeric_limits<int>::max();

    for (auto Entry : State.DefPos) {
      int DefVALUs = Entry.second;
      if (DefVALUs != std::numeric_limits<int>::max()) {
        if (DefVALUs >= State.ExecPos)
          PreExecPos = std::min(PreExecPos, DefVALUs);
        else
          PostExecPos = std::min(PostExecPos, DefVALUs);
      }
    }

    // Need a VALU post-exec change.
    if (PostExecPos == std::numeric_limits<int>::max())
      return NoHazardFound;

    // Too many VALUs in intv3?
    int Intv3VALUs = PostExecPos;
    if (Intv3VALUs > Intv3MaxVALUs)
      return HazardExpired;

    // Too many VALUs in intv2?
    int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
    if (Intv2VALUs > Intv1plus2MaxVALUs)
      return HazardExpired;

    // Need a VALU pre-exec change.
    if (PreExecPos == std::numeric_limits<int>::max())
      return NoHazardFound;

    // Too many VALUs in intv1?
    int Intv1VALUs = PreExecPos - State.ExecPos;
    if (Intv1VALUs > Intv1plus2MaxVALUs)
      return HazardExpired;

    // Too many VALUs in intv1 + intv2?
    if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
      return HazardExpired;

    return HazardFound;
  };
  auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
    if (SIInstrInfo::isVALU(MI))
      State.VALUs += 1;
  };

  DenseSet<const MachineBasicBlock *> Visited;
  if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
                            std::next(MI->getReverseIterator()), Visited))
    return false;

  // 0x0fff encodes va_vdst == 0 with all other dep_ctr fields left at their
  // no-wait values (equivalent to encodeFieldVaVdst(0)).
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII.get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(0x0fff);

  return true;
}

    int Intv1VALUs = PreExecPos - State.ExecPos;
    if (Intv1VALUs > Intv1plus2MaxVALUs)
      return HazardExpired;

    // Too many VALUs in intv1 + intv2?
    if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
      return HazardExpired;

    return HazardFound;
  };
  auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
    if (SIInstrInfo::isVALU(MI))
      State.VALUs += 1;
  };

  DenseSet<const MachineBasicBlock *> Visited;
  if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
                            std::next(MI->getReverseIterator()), Visited))
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII.get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(0x0fff);

  return true;
}

bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
  if (!ST.hasVALUTransUseHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!SIInstrInfo::isVALU(*MI))
    return false;

  SmallSet<Register, 4> SrcVGPRs;

  for (const MachineOperand &Use : MI->explicit_uses()) {
    if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
      SrcVGPRs.insert(Use.getReg());
  }

  // Look for the following pattern:
  //   Va <- TRANS VALU
  //   intv
  //   MI Va (WaitState = 0)
  //
  // Where:
  //   intv <= 5 VALUs / 1 TRANS
  //
  // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.

  const int IntvMaxVALUs = 5;
  const int IntvMaxTRANS = 1;

  struct StateType {
    int VALUs = 0;
    int TRANS = 0;
  };

  StateType State;

  // This overloads expiry testing with all the hazard detection.
  auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
    // Too many VALU states have passed.
    if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
      return HazardExpired;

    // Instructions which cause va_vdst==0 expire the hazard.
    if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isDS(I) ||
        SIInstrInfo::isEXP(I) ||
        (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
         I.getOperand(0).getImm() == 0x0fff))
      return HazardExpired;

    // Track register writes: a TRANS def of any source VGPR is the hazard.
    if (SIInstrInfo::isTRANS(I)) {
      for (Register Src : SrcVGPRs) {
        if (I.modifiesRegister(Src, &TRI)) {
          return HazardFound;
        }
      }
    }

    return NoHazardFound;
  };
  auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
    if (SIInstrInfo::isVALU(MI))
      State.VALUs += 1;
    if (SIInstrInfo::isTRANS(MI))
      State.TRANS += 1;
  };

  DenseSet<const MachineBasicBlock *> Visited;
  if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
                            std::next(MI->getReverseIterator()), Visited))
    return false;

  // Hazard is observed - insert a wait on the va_vdst counter to ensure the
  // hazard is avoided.
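  // For example (hypothetical sequence): if "v_exp_f32 v0, v1" is followed
  // within the interval by "v_add_f32 v2, v0, v3", the inserted
  // "s_waitcnt_depctr va_vdst(0)" makes the TRANS result in v0 visible
  // before the consuming VALU executes.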
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII.get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0));

  return true;
}

bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
  if (!SIInstrInfo::isWMMA(*MI) && !SIInstrInfo::isSWMMAC(*MI))
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) {
    if (!SIInstrInfo::isWMMA(I) && !SIInstrInfo::isSWMMAC(I))
      return false;

    // Hazard if src0 (matrix A) or src1 (matrix B) of the current wmma
    // instruction overlaps with the dest (matrix D) of the previous wmma.
    const Register CurSrc0Reg =
        TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
    const Register CurSrc1Reg =
        TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();

    const Register PrevDstReg =
        TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();

    if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
        TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {
      return true;
    }

    // GFX12+ allows overlap of matrix C with PrevDstReg (hardware will stall),
    // but the SWMMAC index must not overlap with PrevDstReg.
    if (AMDGPU::isGFX12Plus(ST)) {
      if (SIInstrInfo::isSWMMAC(*MI)) {
        const Register CurIndex =
            TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
        if (TRI->regsOverlap(PrevDstReg, CurIndex))
          return true;
      }
      return false;
    }

    return false;
  };

  auto IsExpiredFn = [](const MachineInstr &I, int) {
    return SIInstrInfo::isVALU(I);
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));

  return true;
}

bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
  if (!ST.hasShift64HighRegBug())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  switch (MI->getOpcode()) {
  default:
    return false;
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
    break;
  }

  MachineOperand *Amt = TII.getNamedOperand(*MI, AMDGPU::OpName::src0);
  if (!Amt->isReg())
    return false;

  Register AmtReg = Amt->getReg();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  // Check if this is the last VGPR in the allocation block.
  if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
    return false;

  if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1))
    return false;

  MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1);
  bool OverlappedSrc = Src1->isReg() && TRI.regsOverlap(Src1->getReg(), AmtReg);
  bool OverlappedDst = MI->modifiesRegister(AmtReg, &TRI);
  bool Overlapped = OverlappedSrc || OverlappedDst;

  assert(!OverlappedDst || !OverlappedSrc ||
         Src1->getReg() == MI->getOperand(0).getReg());
  assert(ST.needsAlignedVGPRs());
  static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);

  Register NewReg;
  for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
                                   : AMDGPU::VGPR_32RegClass) {
    if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) {
      NewReg = Reg;
      break;
    }
  }

  Register NewAmt = Overlapped ? (Register)TRI.getSubReg(NewReg, AMDGPU::sub1)
                               : NewReg;
  Register NewAmtLo;

  if (Overlapped)
    NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);

  DebugLoc DL = MI->getDebugLoc();
  MachineBasicBlock *MBB = MI->getParent();
  // Insert a full wait count because the found register might be pending a
  // wait.
  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_WAITCNT))
      .addImm(0);

  // Insert V_SWAP_B32 instruction(s) and run the hazard recognizer on them.
  if (Overlapped)
    runOnInstruction(
        BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmtLo)
            .addDef(AmtReg - 1)
            .addReg(AmtReg - 1, RegState::Undef)
            .addReg(NewAmtLo, RegState::Undef));
  runOnInstruction(BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)
                       .addDef(AmtReg)
                       .addReg(AmtReg, RegState::Undef)
                       .addReg(NewAmt, RegState::Undef));

  // Instructions emitted after the current instruction will be processed by
  // the parent loop of the hazard recognizer in a natural way.
  BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
          AmtReg)
      .addDef(NewAmt)
      .addReg(NewAmt)
      .addReg(AmtReg);
  if (Overlapped)
    BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
            AmtReg - 1)
        .addDef(NewAmtLo)
        .addReg(NewAmtLo)
        .addReg(AmtReg - 1);

  // Re-running the hazard recognizer on the modified instruction is not
  // necessary: the inserted V_SWAP_B32 instructions have already both read and
  // written the new registers, so hazards related to these registers have
  // already been handled.
  Amt->setReg(NewAmt);
  Amt->setIsKill(false);
  // We do not update liveness, so the verifier may see it as undef.
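  // (The undef flag set below is therefore deliberate: the value is in fact
  // made live by the V_SWAP_B32 emitted above.)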
  Amt->setIsUndef();
  if (OverlappedDst)
    MI->getOperand(0).setReg(NewReg);
  if (OverlappedSrc) {
    Src1->setReg(NewReg);
    Src1->setIsKill(false);
    Src1->setIsUndef();
  }

  return true;
}

int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
  int NSAtoVMEMWaitStates = 1;

  if (!ST.hasNSAtoVMEMBug())
    return 0;

  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
    return 0;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
  if (!Offset || (Offset->getImm() & 6) == 0)
    return 0;

  auto IsHazardFn = [TII](const MachineInstr &I) {
    if (!SIInstrInfo::isMIMG(I))
      return false;
    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode());
    return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
           TII->getInstSizeInBytes(I) >= 16;
  };

  return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
}

int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
  int FPAtomicToDenormModeWaitStates = 3;

  if (!ST.hasFPAtomicToDenormModeHazard())
    return 0;
  assert(!ST.hasExtendedWaitCounts());

  if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
    return 0;

  auto IsHazardFn = [](const MachineInstr &I) {
    if (!SIInstrInfo::isVMEM(I))
      return false;
    return SIInstrInfo::isFPAtomic(I);
  };

  auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
    if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
      return true;

    switch (MI.getOpcode()) {
    case AMDGPU::S_WAITCNT:
    case AMDGPU::S_WAITCNT_VSCNT:
    case AMDGPU::S_WAITCNT_VMCNT:
    case AMDGPU::S_WAITCNT_EXPCNT:
    case AMDGPU::S_WAITCNT_LGKMCNT:
    case AMDGPU::S_WAIT_IDLE:
      return true;
    default:
      break;
    }

    return false;
  };

  return FPAtomicToDenormModeWaitStates -
         ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
}

int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
  assert(SIInstrInfo::isMAI(*MI));

  return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
}

int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) {
  // Early exit if no padding is requested.
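  // For scale (illustrative): with -amdgpu-mfma-padding-ratio=50 and a
  // neighboring MFMA whose pipeline takes 16 wait states, up to
  // 16 * 50 / 100 = 8 wait states are filled with s_nops, less any wait
  // states that have already elapsed since that MFMA.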
  if (MFMAPaddingRatio == 0)
    return 0;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (!SIInstrInfo::isMFMA(*MI) || MFI->getOccupancy() < 2)
    return 0;

  int NeighborMFMALatency = 0;
  auto IsNeighboringMFMA = [&NeighborMFMALatency,
                            this](const MachineInstr &MI) {
    if (!SIInstrInfo::isMFMA(MI))
      return false;

    NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);
    return true;
  };

  const int MaxMFMAPipelineWaitStates = 16;
  int WaitStatesSinceNeighborMFMA =
      getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);

  int NeighborMFMAPaddingNeeded =
      (NeighborMFMALatency * MFMAPaddingRatio / 100) -
      WaitStatesSinceNeighborMFMA;

  return std::max(0, NeighborMFMAPaddingNeeded);
}

int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) {
  int WaitStatesNeeded = 0;
  unsigned Opc = MI->getOpcode();

  auto IsVALUFn = [](const MachineInstr &MI) {
    return SIInstrInfo::isVALU(MI) || MI.isInlineAsm();
  };

  if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
    const int LegacyVALUWritesVGPRWaitStates = 2;
    const int VALUWritesExecWaitStates = 4;
    const int MaxWaitStates = 4;

    int WaitStatesNeededForUse = VALUWritesExecWaitStates -
      getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded < MaxWaitStates) {
      for (const MachineOperand &Use : MI->explicit_uses()) {
        const int MaxWaitStates = 2;

        if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
          continue;

        int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
          getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

        if (WaitStatesNeeded == MaxWaitStates)
          break;
      }
    }
  }

  for (const MachineOperand &Op : MI->explicit_operands()) {
    if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
      continue;

    if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
      continue;

    const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
    const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
    const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
    const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
    const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
    const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
    const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
    const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
    const int MaxWaitStates = 18;
    Register Reg = Op.getReg();
    unsigned HazardDefLatency = 0;

    auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
                               this](const MachineInstr &MI) {
      if (!SIInstrInfo::isMFMA(MI))
        return false;
      Register DstReg = MI.getOperand(0).getReg();
      if (DstReg == Reg)
        return false;
      HazardDefLatency =
          std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
      return TRI.regsOverlap(DstReg, Reg);
    };

    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
                                                   MaxWaitStates);
    int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
    int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
    int OpNo = Op.getOperandNo();
    if (OpNo == SrcCIdx) {
      NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
    } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
      switch (HazardDefLatency) {
      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
        break;
      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
        break;
      case 16: [[fallthrough]];
      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
        break;
      }
    } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
      switch (HazardDefLatency) {
      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
        break;
      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
        break;
      case 16: [[fallthrough]];
      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
        break;
      }
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.

    auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
      if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
        return false;
      Register DstReg = MI.getOperand(0).getReg();
      return TRI.regsOverlap(Reg, DstReg);
    };

    const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
    const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
    const int AccVGPRWriteAccVgprReadWaitStates = 3;
    NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
    if (OpNo == SrcCIdx)
      NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
    else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
      NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;

    WaitStatesNeededForUse = NeedWaitStates -
      getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.
  }

  if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
    const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
    const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
    const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
    const int MaxWaitStates = 13;
    Register DstReg = MI->getOperand(0).getReg();
    unsigned HazardDefLatency = 0;

    auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
                         this](const MachineInstr &MI) {
      if (!SIInstrInfo::isMFMA(MI))
        return false;
      Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
      HazardDefLatency =
          std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
      return TRI.regsOverlap(Reg, DstReg);
    };

    int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
    int NeedWaitStates;
    switch (HazardDefLatency) {
    case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
      break;
    case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
      break;
    case 16: [[fallthrough]];
    default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
      break;
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  // Pad neighboring MFMA with noops for better inter-wave performance.
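  // (Padding is requested via -amdgpu-mfma-padding-ratio and computed in
  // checkMFMAPadding above; it is zero by default.)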
  WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));

  return WaitStatesNeeded;
}

static int
GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates(int NumPasses,
                                                              bool IsGFX950) {
  // xdl def cycles | gfx940 | gfx950
  // 2 pass         |   3    |   4
  // 4 pass         |   5    |   6
  // 8 pass         |   9    |  10
  // 16 pass        |  17    |  18
  return NumPasses + 1 + IsGFX950;
}

static int
GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates(int NumPasses,
                                                              bool IsGFX950) {
  // xdl def cycles | gfx940 | gfx950
  // 2 pass         |   3    |   3
  // 4 pass         |   5    |   6
  // 8 pass         |   9    |  10
  // 16 pass        |  17    |  18
  return NumPasses + 1 + (NumPasses != 2 && IsGFX950);
}

static int
GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses) {
  // 2 pass  -> 2
  // 4 pass  -> 4
  // 8 pass  -> 8
  // 16 pass -> 16
  return NumPasses;
}

static int
GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
  // 2 pass  -> 4
  // 4 pass  -> 6
  // 8 pass  -> 10
  // 16 pass -> 18
  return NumPasses + 2;
}

static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses,
                                                                bool IsGFX950) {
  // xdl def cycles | gfx942 | gfx950
  // 2 pass         |   5    |   5
  // 4 pass         |   7    |   8
  // 8 pass         |  11    |  12
  // 16 pass        |  19    |  20
  return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
}

int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
  int WaitStatesNeeded = 0;
  unsigned Opc = MI->getOpcode();

  auto IsLegacyVALUFn = [](const MachineInstr &MI) {
    return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI);
  };

  auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) {
    return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI) &&
           !SIInstrInfo::isDOT(MI);
  };

  if (!SIInstrInfo::isMFMA(*MI))
    return WaitStatesNeeded;

  const int VALUWritesExecWaitStates = 4;
  int WaitStatesNeededForUse = VALUWritesExecWaitStates -
    getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
                          VALUWritesExecWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);

  // Loop for both DGEMM and S/HGEMM 2nd instruction.
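  // Reading the tables above, e.g. (illustrative): an 8-pass XDL MFMA whose
  // result feeds srcC of another XDL MFMA on gfx940 needs
  // NumPasses + 1 = 9 wait states between the two instructions.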
  for (const MachineOperand &Use : MI->explicit_uses()) {
    const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
    const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
    const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
    const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
    const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
    const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
    const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
    const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
    const int GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 17;
    const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
    const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
    const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
    const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
    const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
    const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
    const int GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 19;
    const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
    const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
    const int MaxWaitStates = 19;

    if (!Use.isReg())
      continue;
    Register Reg = Use.getReg();
    bool FullReg;
    const MachineInstr *MI1;

    auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
                               this](const MachineInstr &MI) {
      if (!SIInstrInfo::isMFMA(MI))
        return false;
      Register DstReg = MI.getOperand(0).getReg();
      FullReg = (DstReg == Reg);
      MI1 = &MI;
      return TRI.regsOverlap(DstReg, Reg);
    };

    WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
      getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    int NumWaitStates =
        getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates);
    if (NumWaitStates == std::numeric_limits<int>::max())
      continue;

    int OpNo = Use.getOperandNo();
    unsigned Opc1 = MI1->getOpcode();
    int NeedWaitStates = 0;
    if (OpNo == SrcCIdx) {
      if (!SIInstrInfo::isDGEMM(Opc) &&
          (!ST.hasGFX940Insts() && SIInstrInfo::isDGEMM(Opc1))) {
        NeedWaitStates = 0;
      } else if (FullReg) {
        if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
             Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
            (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
             Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
          NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
        else if (ST.hasGFX940Insts() &&
                 TSchedModel.computeInstrLatency(MI1) == 2)
          NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
      } else {
        switch (Opc1) {
        case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
        case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
        case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
        case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
          if (!TII.isXDL(*MI))
            NeedWaitStates =
                ST.hasGFX950Insts()
                    ? GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates
                    : DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
          break;
        case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
        case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
          if (!TII.isXDL(*MI))
            NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
          break;
        default:
          int NumPasses = TSchedModel.computeInstrLatency(MI1);
          if (ST.hasGFX940Insts()) {
            if (TII.isXDL(*MI) && !TII.isXDL(*MI1))
              break;

            NeedWaitStates =
                TII.isXDL(*MI1)
                    ? (TII.isXDL(*MI)
                           ? GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates(
                                 NumPasses, ST.hasGFX950Insts())
                           : GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates(
                                 NumPasses, ST.hasGFX950Insts()))
                    : GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(
                          NumPasses);
            break;
          }

          switch (NumPasses) {
          case 2:
            NeedWaitStates =
                SIInstrInfo::isDGEMM(Opc)
                    ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
                    : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
            break;
          case 8:
            NeedWaitStates =
                SIInstrInfo::isDGEMM(Opc)
                    ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
                    : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
            break;
          case 16:
            NeedWaitStates =
                SIInstrInfo::isDGEMM(Opc)
                    ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
                    : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
            break;
          default:
            llvm_unreachable("unexpected number of passes");
          }
        }
      }
    } else {
      switch (Opc1) {
      case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
        NeedWaitStates =
            ST.hasGFX950Insts()
                ? GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates
                : DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
        break;
      case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
      case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
        NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
        break;
      default:
        int NumPasses = TSchedModel.computeInstrLatency(MI1);

        if (ST.hasGFX940Insts()) {
          NeedWaitStates =
              TII.isXDL(*MI1)
                  ? GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(
                        NumPasses, ST.hasGFX950Insts())
                  : GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(
                        NumPasses);
          break;
        }

        switch (NumPasses) {
        case 2:
          NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
          break;
        case 4:
          llvm_unreachable("unexpected number of passes for mfma");
        case 8:
          NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
          break;
        case 16:
        default:
          NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
        }
      }
    }
    if (WaitStatesNeeded >= NeedWaitStates)
      continue;

    WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      break;
  }

  // Pad neighboring MFMA with noops for better inter-wave performance.
  WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
  // On gfx90a+ the relevant hazards are checked in checkMAIVALUHazards().
  if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
    return 0;

  int WaitStatesNeeded = 0;

  auto IsAccVgprReadFn = [](const MachineInstr &MI) {
    return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
  };

  for (const MachineOperand &Op : MI->explicit_uses()) {
    if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
      continue;

    Register Reg = Op.getReg();

    const int AccVgprReadLdStWaitStates = 2;
    const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
    const int MaxWaitStates = 2;

    int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
      getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.

    auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
      if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
          MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
        return false;
      auto IsVALUFn = [](const MachineInstr &MI) {
        return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMAI(MI);
      };
      return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
             std::numeric_limits<int>::max();
    };

    WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
      getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkPermlaneHazards(MachineInstr *MI) {
  assert(!ST.hasVcmpxPermlaneHazard() &&
         "this is a different vcmpx+permlane hazard");
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();

  auto IsVCmpXWritesExecFn = [TII, TRI](const MachineInstr &MI) {
    return isVCmpXWritesExec(*TII, *TRI, MI);
  };

  auto IsVALUFn = [](const MachineInstr &MI) {
    return SIInstrInfo::isVALU(MI);
  };

  const int VCmpXWritesExecWaitStates = 4;
  const int VALUWritesVDstWaitStates = 2;
  int WaitStatesNeeded = 0;

  for (const MachineOperand &Op : MI->explicit_uses()) {
    if (!Op.isReg() || !TRI->isVGPR(MF.getRegInfo(), Op.getReg()))
      continue;
    Register Reg = Op.getReg();

    int WaitStatesSinceDef =
        VALUWritesVDstWaitStates -
        getWaitStatesSinceDef(Reg, IsVALUFn,
                              /*MaxWaitStates=*/VALUWritesVDstWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesSinceDef);
    if (WaitStatesNeeded >= VALUWritesVDstWaitStates)
      break;
  }

  int VCmpXHazardWaits =
      VCmpXWritesExecWaitStates -
      getWaitStatesSince(IsVCmpXWritesExecFn, VCmpXWritesExecWaitStates);

  WaitStatesNeeded = std::max(WaitStatesNeeded, VCmpXHazardWaits);
  return WaitStatesNeeded;
}

static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
  // 2 pass  -> 4
  // 4 pass  -> 6
  // 8 pass  -> 10
  // 16 pass -> 18
  return NumPasses + 2;
}

static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses,
                                                       bool IsGFX950) {
  // xdl def cycles | gfx942 | gfx950
  // 2 pass         |   5    |   5
  // 4 pass         |   7    |   8
  // 8 pass         |  11    |  12
  // 16 pass        |  19    |  20
  return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
}

static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses,
                                                              bool IsGFX950) {
  // xdl def cycles | gfx942 | gfx950
  // 2 pass         |   5    |   5
  // 4 pass         |   7    |   8
  // 8 pass         |  11    |  12
  // 16 pass        |  19    |  20
  return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
}

static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
  // 2 pass  -> 4
  // 4 pass  -> 6
  // 8 pass  -> 10
  // 16 pass -> 18
  return NumPasses + 2;
}

int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
  if (!ST.hasGFX90AInsts())
    return 0;

  auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
    return SIInstrInfo::isDGEMM(MI.getOpcode());
  };

  // This is checked in checkMAIHazards90A().
  if (SIInstrInfo::isMFMA(*MI))
    return 0;

  const MachineRegisterInfo &MRI = MF.getRegInfo();

  int WaitStatesNeeded = 0;

  bool IsMem = SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isDS(*MI);
  bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(*MI);
  bool IsVALU = SIInstrInfo::isVALU(*MI);

  const MachineInstr *MFMA = nullptr;
  unsigned Reg;
  auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
    if (!SIInstrInfo::isMFMA(MI) ||
        !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
      return false;
    MFMA = &MI;
    return true;
  };

  const MachineInstr *DOT = nullptr;
  auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
    if (!SIInstrInfo::isDOT(MI) ||
        !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
      return false;
    DOT = &MI;
    return true;
  };

  bool DGEMMAfterVALUWrite = false;
  auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
    // Found DGEMM on reverse traversal to def.
    if (SIInstrInfo::isDGEMM(MI.getOpcode()))
      DGEMMAfterVALUWrite = true;

    // Only a hazard if the register is defined by a VALU and a DGEMM is
    // found after the def.
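    // Illustrative reverse scan from the VMEM instruction (made-up sequence):
    //   v_mov_b32 v0, ...          ; VALU def - hazard reported here
    //   v_mfma_f64_16x16x4f64 ...  ; DGEMM seen first, sets the flag
    //   flat_store_dword ..., v0   ; scan starts at this use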
    if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite)
      return false;

    return true;
  };

  int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                           AMDGPU::OpName::src2);

  if (IsMemOrExport || IsVALU) {
    const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
    const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
    const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
    const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
    const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
    const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
    const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
    const int GFX950_DMFMA16x16WriteVgprVALUReadWaitStates = 19;
    const int DotWriteSameDotReadSrcAB = 3;
    const int DotWriteDifferentVALURead = 3;
    const int DMFMABetweenVALUWriteVMEMRead = 2;
    const int MaxWaitStates = 19;

    for (const MachineOperand &Use : MI->explicit_uses()) {
      if (!Use.isReg())
        continue;
      Reg = Use.getReg();

      DOT = nullptr;
      int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
                                                     MaxWaitStates);
      if (DOT) {
        int NeedWaitStates = 0;
        if (DOT->getOpcode() == MI->getOpcode()) {
          if (&Use - &MI->getOperand(0) != SrcCIdx)
            NeedWaitStates = DotWriteSameDotReadSrcAB;
        } else {
          NeedWaitStates = DotWriteDifferentVALURead;
        }

        int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
      }

      // Workaround for a HW data hazard bug observed only in GFX90A. When
      // there is a DGEMM instruction in between a VALU and a VMEM instruction,
      // it causes the SQ to incorrectly omit the two wait states between the
      // two instructions that are needed to avoid the data hazard.
      if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
        DGEMMAfterVALUWrite = false;
        if (TRI.isVectorRegister(MRI, Reg)) {
          int WaitStatesNeededForUse =
              DMFMABetweenVALUWriteVMEMRead -
              getWaitStatesSinceDef(Reg, IsDGEMMHazard,
                                    DMFMABetweenVALUWriteVMEMRead);

          WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
        }
      }

      MFMA = nullptr;
      WaitStatesSinceDef =
          getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
      if (!MFMA)
        continue;

      unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
      int NumPasses = HazardDefLatency;
      int NeedWaitStates = MaxWaitStates;

      if (SIInstrInfo::isDGEMM(MFMA->getOpcode())) {
        switch (HazardDefLatency) {
        case 4:
          NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
                                         : DMFMA4x4WriteVgprVALUReadWaitStates;
          break;
        case 8:
        case 16:
          NeedWaitStates =
              IsMemOrExport
                  ? DMFMA16x16WriteVgprMemExpReadWaitStates
                  : (ST.hasGFX950Insts()
                         ? GFX950_DMFMA16x16WriteVgprVALUReadWaitStates
                         : DMFMA16x16WriteVgprVALUReadWaitStates);
          break;
        default:
          llvm_unreachable("unexpected dgemm");
        }
      } else if (ST.hasGFX940Insts()) {
        NeedWaitStates =
            TII.isXDL(*MFMA)
                ? GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(
                      NumPasses, ST.hasGFX950Insts())
                : GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(
                      NumPasses);
      } else {
        switch (HazardDefLatency) {
        case 2:
          NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
          break;
        case 8:
          NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
          break;
        case 16:
          NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
          break;
        default:
          llvm_unreachable("unexpected number of passes for mfma");
        }
      }

      int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

      if (WaitStatesNeeded == MaxWaitStates)
        break;
    }
  }

  unsigned Opc = MI->getOpcode();
  const int DMFMAToFMA64WaitStates = 2;
  if ((Opc == AMDGPU::V_FMA_F64_e64 ||
       Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
       Opc == AMDGPU::V_FMAC_F64_dpp) &&
      WaitStatesNeeded < DMFMAToFMA64WaitStates) {
    int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
      getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  if (!IsVALU && !IsMemOrExport)
    return WaitStatesNeeded;

  for (const MachineOperand &Def : MI->defs()) {
    const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
    const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
    const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
    const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
    const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
    const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
    const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
    const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
    const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
    const int DotWriteDifferentVALUWrite = 3;
    const int MaxWaitStates = 19;
    const int MaxWarWaitStates = 15;

    Reg = Def.getReg();

    DOT = nullptr;
    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
                                                   MaxWaitStates);
    if (DOT && DOT->getOpcode() != MI->getOpcode())
      WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
                                                    WaitStatesSinceDef);

    MFMA = nullptr;
    WaitStatesSinceDef =
        getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
    if (MFMA) {
      int NeedWaitStates = MaxWaitStates;
      int NumPasses = TSchedModel.computeInstrLatency(MFMA);

      if (SIInstrInfo::isDGEMM(MFMA->getOpcode())) {
        switch (NumPasses) {
        case 4:
          NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
          break;
        case 8:
        case 16:
          NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates;
          break;
        default:
          llvm_unreachable("unexpected number of cycles for dgemm");
        }
      } else if (ST.hasGFX940Insts()) {
        NeedWaitStates =
            TII.isXDL(*MFMA)
                ? GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(
                      NumPasses, ST.hasGFX950Insts())
                : GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(NumPasses);
      } else {
        switch (NumPasses) {
        case 2:
          NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
          break;
        case 8:
          NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
          break;
        case 16:
          NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates;
          break;
        default:
          llvm_unreachable("Unexpected number of passes for mfma");
        }
      }

      int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

      if (WaitStatesNeeded == MaxWaitStates)
        break;
    }

    auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
      if (!SIInstrInfo::isMFMA(MI) || SIInstrInfo::isDGEMM(MI.getOpcode()) ||
          !MI.readsRegister(Reg, &TRI))
        return false;

      if (ST.hasGFX940Insts() && !TII.isXDL(MI))
        return false;

      const MachineOperand *SrcC =
          TII.getNamedOperand(MI, AMDGPU::OpName::src2);
      assert(SrcC);
      if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
        return false;

      MFMA = &MI;
      return true;
    };

    MFMA = nullptr;
    int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
                                                MaxWarWaitStates);
    if (!MFMA)
      continue;

    unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
    int NeedWaitStates = MaxWaitStates;
    switch (HazardDefLatency) {
    case 2:  NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
      break;
    case 4:  assert(ST.hasGFX940Insts());
      NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
      break;
    case 8:  NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
      break;
    case 16: [[fallthrough]];
    default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
      break;
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  return WaitStatesNeeded;
}

bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
  if (!SU->isInstr())
    return false;

  const MachineInstr *MAI = nullptr;

  auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
    MAI = nullptr;
    if (SIInstrInfo::isMFMA(MI))
      MAI = &MI;
    return MAI != nullptr;
  };

  MachineInstr *MI = SU->getInstr();
  if (IsMFMAFn(*MI)) {
    int W = getWaitStatesSince(IsMFMAFn, 16);
    if (MAI)
      return W < (int)TSchedModel.computeInstrLatency(MAI);
  }

  return false;
}

// Adjust global offsets for instructions bundled with S_GETPC_B64 after
// insertion of a new instruction.
static void updateGetPCBundle(MachineInstr *NewMI) {
  if (!NewMI->isBundled())
    return;

  // Find the start of the bundle.
  auto I = NewMI->getIterator();
  while (I->isBundledWithPred())
    I--;
  if (I->isBundle())
    I++;

  // Bail if this is not an S_GETPC bundle.
  if (I->getOpcode() != AMDGPU::S_GETPC_B64)
    return;

  // Update offsets of any references in the bundle.
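  // E.g. (illustrative): for a bundle
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, sym@rel32@lo+4
  //   s_addc_u32 s1, s1, sym@rel32@hi+12
  // inserting the 4-byte s_waitcnt_depctr after s_getpc_b64 requires both
  // rel32 offsets to grow by 4.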
  const unsigned NewBytes = 4;
  assert(NewMI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
         "Unexpected instruction insertion in bundle");
  auto NextMI = std::next(NewMI->getIterator());
  auto End = NewMI->getParent()->end();
  while (NextMI != End && NextMI->isBundledWithPred()) {
    for (auto &Operand : NextMI->operands()) {
      if (Operand.isGlobal())
        Operand.setOffset(Operand.getOffset() + NewBytes);
    }
    NextMI++;
  }
}

bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
  if (!ST.hasVALUMaskWriteHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!ST.isWave64() || !SIInstrInfo::isSALU(*MI))
    return false;

  // The hazard sequence is three instructions:
  //   1. VALU reads SGPR as mask
  //   2. SALU writes SGPR
  //   3. SALU reads SGPR
  // The hazard can expire if the distance between 2 and 3 is sufficient.
  // In practice this happens <10% of the time, hence this code always assumes
  // the hazard exists if 1 and 2 are present, to avoid searching.

  const MachineOperand *SDSTOp = TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
  if (!SDSTOp || !SDSTOp->isReg())
    return false;

  const Register HazardReg = SDSTOp->getReg();
  if (HazardReg == AMDGPU::EXEC ||
      HazardReg == AMDGPU::EXEC_LO ||
      HazardReg == AMDGPU::EXEC_HI ||
      HazardReg == AMDGPU::M0)
    return false;

  auto IsHazardFn = [HazardReg, this](const MachineInstr &I) {
    switch (I.getOpcode()) {
    case AMDGPU::V_ADDC_U32_e32:
    case AMDGPU::V_ADDC_U32_dpp:
    case AMDGPU::V_CNDMASK_B16_t16_e32:
    case AMDGPU::V_CNDMASK_B16_fake16_e32:
    case AMDGPU::V_CNDMASK_B16_t16_dpp:
    case AMDGPU::V_CNDMASK_B16_fake16_dpp:
    case AMDGPU::V_CNDMASK_B32_e32:
    case AMDGPU::V_CNDMASK_B32_dpp:
    case AMDGPU::V_DIV_FMAS_F32_e64:
    case AMDGPU::V_DIV_FMAS_F64_e64:
    case AMDGPU::V_SUBB_U32_e32:
    case AMDGPU::V_SUBB_U32_dpp:
    case AMDGPU::V_SUBBREV_U32_e32:
    case AMDGPU::V_SUBBREV_U32_dpp:
      // These implicitly read VCC as the mask source.
      return HazardReg == AMDGPU::VCC ||
             HazardReg == AMDGPU::VCC_LO ||
             HazardReg == AMDGPU::VCC_HI;
    case AMDGPU::V_ADDC_U32_e64:
    case AMDGPU::V_ADDC_U32_e64_dpp:
    case AMDGPU::V_CNDMASK_B16_t16_e64:
    case AMDGPU::V_CNDMASK_B16_fake16_e64:
    case AMDGPU::V_CNDMASK_B16_t16_e64_dpp:
    case AMDGPU::V_CNDMASK_B16_fake16_e64_dpp:
    case AMDGPU::V_CNDMASK_B32_e64:
    case AMDGPU::V_CNDMASK_B32_e64_dpp:
    case AMDGPU::V_SUBB_U32_e64:
    case AMDGPU::V_SUBB_U32_e64_dpp:
    case AMDGPU::V_SUBBREV_U32_e64:
    case AMDGPU::V_SUBBREV_U32_e64_dpp: {
      // Only check mask register overlaps.
      const MachineOperand *SSRCOp =
          TII.getNamedOperand(I, AMDGPU::OpName::src2);
      assert(SSRCOp);
      return TRI.regsOverlap(SSRCOp->getReg(), HazardReg);
    }
    default:
      return false;
    }
  };

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  auto IsExpiredFn = [&MRI, this](const MachineInstr &I, int) {
    // s_waitcnt_depctr sa_sdst(0) mitigates the hazard.
    if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
        AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0)
      return true;

    // A VALU access to any SGPR or literal constant other than HazardReg
    // mitigates the hazard. No need to check HazardReg here as this will
    // only be called when !IsHazardFn.
    if (!SIInstrInfo::isVALU(I))
      return false;
    for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) {
      const MachineOperand &Op = I.getOperand(OpNo);
      if (Op.isReg()) {
        Register OpReg = Op.getReg();
        // Only consider uses.
        if (!Op.isUse())
          continue;
        // Ignore EXEC.
        if (OpReg == AMDGPU::EXEC ||
            OpReg == AMDGPU::EXEC_LO ||
            OpReg == AMDGPU::EXEC_HI)
          continue;
        // Ignore all implicit uses except VCC.
        if (Op.isImplicit()) {
          if (OpReg == AMDGPU::VCC ||
              OpReg == AMDGPU::VCC_LO ||
              OpReg == AMDGPU::VCC_HI)
            return true;
          continue;
        }
        if (TRI.isSGPRReg(MRI, OpReg))
          return true;
      } else {
        const MCInstrDesc &InstDesc = I.getDesc();
        const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
        if (!TII.isInlineConstant(Op, OpInfo))
          return true;
      }
    }
    return false;
  };

  // Check for the hazard.
  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  auto NextMI = std::next(MI->getIterator());

  // Add s_waitcnt_depctr sa_sdst(0) after the SALU write.
  auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
                       TII.get(AMDGPU::S_WAITCNT_DEPCTR))
                   .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));

  // The SALU write may be s_getpc in a bundle.
  updateGetPCBundle(NewMI);

  return true;
}

static bool ensureEntrySetPrio(MachineFunction *MF, int Priority,
                               const SIInstrInfo &TII) {
  MachineBasicBlock &EntryMBB = MF->front();
  if (EntryMBB.begin() != EntryMBB.end()) {
    auto &EntryMI = *EntryMBB.begin();
    if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO &&
        EntryMI.getOperand(0).getImm() >= Priority)
      return false;
  }

  BuildMI(EntryMBB, EntryMBB.begin(), DebugLoc(), TII.get(AMDGPU::S_SETPRIO))
      .addImm(Priority);
  return true;
}

bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
  if (!ST.hasRequiredExportPriority())
    return false;

  // Assume the following shader types will never have exports,
  // and avoid adding or adjusting S_SETPRIO.
  MachineBasicBlock *MBB = MI->getParent();
  MachineFunction *MF = MBB->getParent();
  auto CC = MF->getFunction().getCallingConv();
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_CS_Chain:
  case CallingConv::AMDGPU_CS_ChainPreserve:
  case CallingConv::AMDGPU_KERNEL:
    return false;
  default:
    break;
  }

  const int MaxPriority = 3;
  const int NormalPriority = 2;
  const int PostExportPriority = 0;

  auto It = MI->getIterator();
  switch (MI->getOpcode()) {
  case AMDGPU::S_ENDPGM:
  case AMDGPU::S_ENDPGM_SAVED:
  case AMDGPU::S_ENDPGM_ORDERED_PS_DONE:
  case AMDGPU::SI_RETURN_TO_EPILOG:
    // Ensure a shader with calls raises its priority at entry.
    // This ensures correct priority if exports exist in the callee.
    if (MF->getFrameInfo().hasCalls())
      return ensureEntrySetPrio(MF, NormalPriority, TII);
    return false;
  case AMDGPU::S_SETPRIO: {
    // Raise the minimum priority unless in a workaround.
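    // (An S_SETPRIO of PostExportPriority directly after an export is part
    // of the workaround sequence emitted below and must be left alone.)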
    auto &PrioOp = MI->getOperand(0);
    int Prio = PrioOp.getImm();
    bool InWA = (Prio == PostExportPriority) &&
                (It != MBB->begin() && TII.isEXP(*std::prev(It)));
    if (InWA || Prio >= NormalPriority)
      return false;
    PrioOp.setImm(std::min(Prio + NormalPriority, MaxPriority));
    return true;
  }
  default:
    if (!TII.isEXP(*MI))
      return false;
    break;
  }

  // Check the entry priority at each export (as there will only be a few).
  // Note: amdgpu_gfx can only be a callee, so defer to the caller's setprio.
  bool Changed = false;
  if (CC != CallingConv::AMDGPU_Gfx)
    Changed = ensureEntrySetPrio(MF, NormalPriority, TII);

  auto NextMI = std::next(It);
  bool EndOfShader = false;
  if (NextMI != MBB->end()) {
    // Only need the WA at the end of a sequence of exports.
    if (TII.isEXP(*NextMI))
      return Changed;
    // Assume an appropriate S_SETPRIO after an export means the WA was
    // already applied.
    if (NextMI->getOpcode() == AMDGPU::S_SETPRIO &&
        NextMI->getOperand(0).getImm() == PostExportPriority)
      return Changed;
    EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM;
  }

  const DebugLoc &DL = MI->getDebugLoc();

  // Lower the priority.
  BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
      .addImm(PostExportPriority);

  if (!EndOfShader) {
    // Wait for exports to complete.
    BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_WAITCNT_EXPCNT))
        .addReg(AMDGPU::SGPR_NULL)
        .addImm(0);
  }

  BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
  BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);

  if (!EndOfShader) {
    // Return to normal (higher) priority.
    BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
        .addImm(NormalPriority);
  }

  return true;
}