//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements hazard recognizers for scheduling on GCN processors.
//
//===----------------------------------------------------------------------===//

#include "GCNHazardRecognizer.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/Support/TargetParser.h"

using namespace llvm;

//===----------------------------------------------------------------------===//
// Hazard Recognizer Implementation
//===----------------------------------------------------------------------===//

GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
  IsHazardRecognizerMode(false),
  CurrCycleInstr(nullptr),
  MF(MF),
  ST(MF.getSubtarget<GCNSubtarget>()),
  TII(*ST.getInstrInfo()),
  TRI(TII.getRegisterInfo()),
  ClauseUses(TRI.getNumRegUnits()),
  ClauseDefs(TRI.getNumRegUnits()) {
  MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 18 : 5;
  TSchedModel.init(&ST);
}

void GCNHazardRecognizer::Reset() {
  EmittedInstrs.clear();
}

void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
  EmitInstruction(SU->getInstr());
}

void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
  CurrCycleInstr = MI;
}

static bool isDivFMas(unsigned Opcode) {
  return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 ||
         Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
}

static bool isSGetReg(unsigned Opcode) {
  return Opcode == AMDGPU::S_GETREG_B32;
}

static bool isSSetReg(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_SETREG_B32:
  case AMDGPU::S_SETREG_B32_mode:
  case AMDGPU::S_SETREG_IMM32_B32:
  case AMDGPU::S_SETREG_IMM32_B32_mode:
    return true;
  }
  return false;
}

static bool isRWLane(unsigned Opcode) {
  return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
}

static bool isRFE(unsigned Opcode) {
  return Opcode == AMDGPU::S_RFE_B64;
}

static bool isSMovRel(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_MOVRELS_B32:
  case AMDGPU::S_MOVRELS_B64:
  case AMDGPU::S_MOVRELD_B32:
  case AMDGPU::S_MOVRELD_B64:
    return true;
  default:
    return false;
  }
}

static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
                                    const MachineInstr &MI) {
  if (TII.isAlwaysGDS(MI.getOpcode()))
    return true;

  switch (MI.getOpcode()) {
  case AMDGPU::S_SENDMSG:
  case AMDGPU::S_SENDMSGHALT:
  case AMDGPU::S_TTRACEDATA:
    return true;
  // These DS opcodes don't support GDS.
  case AMDGPU::DS_NOP:
  case AMDGPU::DS_PERMUTE_B32:
  case AMDGPU::DS_BPERMUTE_B32:
    return false;
  default:
    if (TII.isDS(MI.getOpcode())) {
      int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                           AMDGPU::OpName::gds);
      if (MI.getOperand(GDS).getImm())
        return true;
    }
    return false;
  }
}

static bool isPermlane(const MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANEX16_B32_e64;
}

static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
  const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
                                                     AMDGPU::OpName::simm16);
  return RegOp->getImm() & AMDGPU::Hwreg::ID_MASK_;
}

ScheduleHazardRecognizer::HazardType
GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
  MachineInstr *MI = SU->getInstr();
  // If we are not in "HazardRecognizerMode" we are being run from the
  // scheduler: track possible stalls from hazards but don't insert noops.
  auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;

  if (MI->isBundle())
    return NoHazard;

  if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
    return HazardType;

  // FIXME: Should flat be considered vmem?
  if ((SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isFLAT(*MI))
      && checkVMEMHazards(MI) > 0)
    return HazardType;

  if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
    return HazardType;

  if (checkFPAtomicToDenormModeHazard(MI) > 0)
    return HazardType;

  if (ST.hasNoDataDepHazard())
    return NoHazard;

  if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
    return HazardType;

  if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
    return HazardType;

  if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
    return HazardType;

  if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
    return HazardType;

  if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
    return HazardType;

  if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
    return HazardType;

  if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
    return HazardType;

  if (ST.hasReadM0MovRelInterpHazard() &&
      (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode())) &&
      checkReadM0Hazards(MI) > 0)
    return HazardType;

  if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI) &&
      checkReadM0Hazards(MI) > 0)
    return HazardType;

  if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
    return HazardType;

  if ((SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isFLAT(*MI) ||
       SIInstrInfo::isDS(*MI)) && checkMAILdStHazards(MI) > 0)
    return HazardType;

  if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
    return HazardType;

  return NoHazard;
}

static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
                                unsigned Quantity) {
  while (Quantity > 0) {
    unsigned Arg = std::min(Quantity, 8u);
    Quantity -= Arg;
    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
        .addImm(Arg - 1);
  }
}

void GCNHazardRecognizer::processBundle() {
  MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
  MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
  // Check bundled MachineInstr's for hazards.
  for (; MI != E && MI->isInsideBundle(); ++MI) {
    CurrCycleInstr = &*MI;
    unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);

    if (IsHazardRecognizerMode) {
      fixHazards(CurrCycleInstr);

      insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
    }

    // It's unnecessary to track more than MaxLookAhead instructions. Since we
    // include the bundled MI directly after, only add a maximum of
    // (MaxLookAhead - 1) noops to EmittedInstrs.
    for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
      EmittedInstrs.push_front(nullptr);

    EmittedInstrs.push_front(CurrCycleInstr);
    EmittedInstrs.resize(MaxLookAhead);
  }
  CurrCycleInstr = nullptr;
}

unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
  IsHazardRecognizerMode = true;
  CurrCycleInstr = MI;
  unsigned W = PreEmitNoopsCommon(MI);
  fixHazards(MI);
  CurrCycleInstr = nullptr;
  return W;
}

unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
  if (MI->isBundle())
    return 0;

  int WaitStates = 0;

  if (SIInstrInfo::isSMRD(*MI))
    return std::max(WaitStates, checkSMRDHazards(MI));

  if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
    WaitStates = std::max(WaitStates, checkVMEMHazards(MI));

  if (ST.hasNSAtoVMEMBug())
    WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));

  WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));

  if (ST.hasNoDataDepHazard())
    return WaitStates;

  if (SIInstrInfo::isVALU(*MI))
    WaitStates = std::max(WaitStates, checkVALUHazards(MI));

  if (SIInstrInfo::isDPP(*MI))
    WaitStates = std::max(WaitStates, checkDPPHazards(MI));

  if (isDivFMas(MI->getOpcode()))
    WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));

  if (isRWLane(MI->getOpcode()))
    WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));

  if (MI->isInlineAsm())
    return std::max(WaitStates, checkInlineAsmHazards(MI));

  if (isSGetReg(MI->getOpcode()))
    return std::max(WaitStates, checkGetRegHazards(MI));

  if (isSSetReg(MI->getOpcode()))
    return std::max(WaitStates, checkSetRegHazards(MI));

  if (isRFE(MI->getOpcode()))
    return std::max(WaitStates, checkRFEHazards(MI));

  if (ST.hasReadM0MovRelInterpHazard() && (TII.isVINTRP(*MI) ||
                                           isSMovRel(MI->getOpcode())))
    return std::max(WaitStates, checkReadM0Hazards(MI));

  if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI))
    return std::max(WaitStates, checkReadM0Hazards(MI));

  if (SIInstrInfo::isMAI(*MI))
    return std::max(WaitStates, checkMAIHazards(MI));

  if (SIInstrInfo::isVMEM(*MI) ||
      SIInstrInfo::isFLAT(*MI) ||
      SIInstrInfo::isDS(*MI))
    return std::max(WaitStates, checkMAILdStHazards(MI));

  return WaitStates;
}

void GCNHazardRecognizer::EmitNoop() {
  EmittedInstrs.push_front(nullptr);
}

void GCNHazardRecognizer::AdvanceCycle() {
  // When the scheduler detects a stall, it will call AdvanceCycle() without
  // emitting any instructions.
  if (!CurrCycleInstr) {
    EmittedInstrs.push_front(nullptr);
    return;
  }

  // Do not track non-instructions which do not affect the wait states.
  // If included, these instructions can lead to buffer overflow such that
  // detectable hazards are missed.
  if (CurrCycleInstr->isImplicitDef() || CurrCycleInstr->isDebugInstr() ||
      CurrCycleInstr->isKill()) {
    CurrCycleInstr = nullptr;
    return;
  }

  if (CurrCycleInstr->isBundle()) {
    processBundle();
    return;
  }

  unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);

  // Keep track of emitted instructions
  EmittedInstrs.push_front(CurrCycleInstr);

  // Add a nullptr for each additional wait state after the first. Make sure
  // not to add more than getMaxLookAhead() items to the list, since we
  // truncate the list to that size right after this loop.
  for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
       i < e; ++i) {
    EmittedInstrs.push_front(nullptr);
  }

  // getMaxLookAhead() is the largest number of wait states we will ever need
  // to insert, so there is no point in keeping track of more than that many
  // wait states.
  EmittedInstrs.resize(getMaxLookAhead());

  CurrCycleInstr = nullptr;
}

void GCNHazardRecognizer::RecedeCycle() {
  llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
}

//===----------------------------------------------------------------------===//
// Helper Functions
//===----------------------------------------------------------------------===//

typedef function_ref<bool(MachineInstr *, int WaitStates)> IsExpiredFn;

// Returns the minimum number of wait states since \p I, walking all
// predecessors. Only scans until \p IsExpired returns true.
// Can only be run in hazard recognizer mode.
static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
                              MachineBasicBlock *MBB,
                              MachineBasicBlock::reverse_instr_iterator I,
                              int WaitStates,
                              IsExpiredFn IsExpired,
                              DenseSet<const MachineBasicBlock *> &Visited) {
  for (auto E = MBB->instr_rend(); I != E; ++I) {
    // Don't add WaitStates for parent BUNDLE instructions.
    if (I->isBundle())
      continue;

    if (IsHazard(&*I))
      return WaitStates;

    if (I->isInlineAsm() || I->isMetaInstruction())
      continue;

    WaitStates += SIInstrInfo::getNumWaitStates(*I);

    if (IsExpired(&*I, WaitStates))
      return std::numeric_limits<int>::max();
  }

  int MinWaitStates = WaitStates;
  bool Found = false;
  for (MachineBasicBlock *Pred : MBB->predecessors()) {
    if (!Visited.insert(Pred).second)
      continue;

    int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(),
                               WaitStates, IsExpired, Visited);

    if (W == std::numeric_limits<int>::max())
      continue;

    MinWaitStates = Found ? std::min(MinWaitStates, W) : W;
    if (IsExpired(nullptr, MinWaitStates))
      return MinWaitStates;

    Found = true;
  }

  if (Found)
    return MinWaitStates;

  return std::numeric_limits<int>::max();
}

static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
                              MachineInstr *MI,
                              IsExpiredFn IsExpired) {
  DenseSet<const MachineBasicBlock *> Visited;
  return getWaitStatesSince(IsHazard, MI->getParent(),
                            std::next(MI->getReverseIterator()),
                            0, IsExpired, Visited);
}

int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
  if (IsHazardRecognizerMode) {
    auto IsExpiredFn = [Limit] (MachineInstr *, int WaitStates) {
      return WaitStates >= Limit;
    };
    return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
  }

  int WaitStates = 0;
  for (MachineInstr *MI : EmittedInstrs) {
    if (MI) {
      if (IsHazard(MI))
        return WaitStates;

      if (MI->isInlineAsm())
        continue;
    }
    ++WaitStates;

    if (WaitStates >= Limit)
      break;
  }
  return std::numeric_limits<int>::max();
}

int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
                                               IsHazardFn IsHazardDef,
                                               int Limit) {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [IsHazardDef, TRI, Reg] (MachineInstr *MI) {
    return IsHazardDef(MI) && MI->modifiesRegister(Reg, TRI);
  };

  return getWaitStatesSince(IsHazardFn, Limit);
}

int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
                                                  int Limit) {
  auto IsHazardFn = [IsHazard] (MachineInstr *MI) {
    return isSSetReg(MI->getOpcode()) && IsHazard(MI);
  };

  return getWaitStatesSince(IsHazardFn, Limit);
}

//===----------------------------------------------------------------------===//
// No-op Hazard Detection
//===----------------------------------------------------------------------===//

static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
                        MCRegister Reg) {
  for (MCRegUnitIterator RUI(Reg, &TRI); RUI.isValid(); ++RUI)
    BV.set(*RUI);
}

static void addRegsToSet(const SIRegisterInfo &TRI,
                         iterator_range<MachineInstr::const_mop_iterator> Ops,
                         BitVector &Set) {
  for (const MachineOperand &Op : Ops) {
    if (Op.isReg())
      addRegUnits(TRI, Set, Op.getReg().asMCReg());
  }
}

void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
  // XXX: Do we need to worry about implicit operands?
  addRegsToSet(TRI, MI.defs(), ClauseDefs);
  addRegsToSet(TRI, MI.uses(), ClauseUses);
}

static bool breaksSMEMSoftClause(MachineInstr *MI) {
  return !SIInstrInfo::isSMRD(*MI);
}

static bool breaksVMEMSoftClause(MachineInstr *MI) {
  return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI);
}

int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
  // SMEM soft clauses are only present on VI+, and only matter if xnack is
  // enabled.
  if (!ST.isXNACKEnabled())
    return 0;

  bool IsSMRD = TII.isSMRD(*MEM);

  resetClause();

  // A soft-clause is any group of consecutive SMEM instructions. The
  // instructions in this group may return out of order and/or may be
  // replayed (i.e. the same instruction issued more than once).
  //
  // In order to handle these situations correctly we need to make sure that
  // when a clause has more than one instruction, no instruction in the clause
  // writes to a register that is read by another instruction in the clause
  // (including itself). If we encounter this situation, we need to break the
  // clause by inserting a non-SMEM instruction.

  for (MachineInstr *MI : EmittedInstrs) {
    // When we hit a non-SMEM instruction then we have passed the start of the
    // clause and we can stop.
    if (!MI)
      break;

    if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
      break;

    addClauseInst(*MI);
  }

  if (ClauseDefs.none())
    return 0;

  // We need to make sure not to put loads and stores in the same clause if
  // they use the same address. For now, just start a new clause whenever we
  // see a store.
  if (MEM->mayStore())
    return 1;

  addClauseInst(*MEM);

  // If the set of defs and uses intersect then we cannot add this instruction
  // to the clause, so we have a hazard.
  return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
}

int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
  int WaitStatesNeeded = 0;

  WaitStatesNeeded = checkSoftClauseHazards(SMRD);

  // This SMRD hazard only affects SI.
  if (!ST.hasSMRDReadVALUDefHazard())
    return WaitStatesNeeded;

  // A read of an SGPR by an SMRD instruction requires 4 wait states when the
  // SGPR was written by a VALU instruction.
  int SmrdSgprWaitStates = 4;
  auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); };
  auto IsBufferHazardDefFn = [this] (MachineInstr *MI) { return TII.isSALU(*MI); };

  bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);

  for (const MachineOperand &Use : SMRD->uses()) {
    if (!Use.isReg())
      continue;
    int WaitStatesNeededForUse =
        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   SmrdSgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    // This fixes what appears to be undocumented hardware behavior in SI where
    // s_mov writing a descriptor and s_buffer_load_dword reading the
    // descriptor needs some number of nops in between. We don't know how many
    // we need, but let's use 4. This wasn't discovered before probably because
    // the only case when this happens is when we expand a 64-bit pointer into
    // a full descriptor and use s_buffer_load_dword instead of s_load_dword,
    // which was probably never encountered in the closed-source land.
    if (IsBufferSMRD) {
      int WaitStatesNeededForUse =
          SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
                                                     IsBufferHazardDefFn,
                                                     SmrdSgprWaitStates);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
    }
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
  if (!ST.hasVMEMReadSGPRVALUDefHazard())
    return 0;

  int WaitStatesNeeded = checkSoftClauseHazards(VMEM);

  // A read of an SGPR by a VMEM instruction requires 5 wait states when the
  // SGPR was written by a VALU instruction.
  const int VmemSgprWaitStates = 5;
  auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); };
  for (const MachineOperand &Use : VMEM->uses()) {
    if (!Use.isReg() || TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
      continue;

    int WaitStatesNeededForUse =
        VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   VmemSgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }
  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();

  // Check for DPP VGPR read after VALU VGPR write and EXEC write.
  int DppVgprWaitStates = 2;
  int DppExecWaitStates = 5;
  int WaitStatesNeeded = 0;
  auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };

  for (const MachineOperand &Use : DPP->uses()) {
    if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
      continue;
    int WaitStatesNeededForUse =
        DppVgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
                              [](MachineInstr *) { return true; },
                              DppVgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  WaitStatesNeeded = std::max(
      WaitStatesNeeded,
      DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
                                                DppExecWaitStates));

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
  const SIInstrInfo *TII = ST.getInstrInfo();

  // v_div_fmas requires 4 wait states after a write to vcc from a VALU
  // instruction.
  const int DivFMasWaitStates = 4;
  auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
  int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
                                               DivFMasWaitStates);

  return DivFMasWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);

  const int GetRegWaitStates = 2;
  auto IsHazardFn = [TII, GetRegHWReg] (MachineInstr *MI) {
    return GetRegHWReg == getHWReg(TII, *MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);

  return GetRegWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned HWReg = getHWReg(TII, *SetRegInstr);

  const int SetRegWaitStates = ST.getSetRegWaitStates();
  auto IsHazardFn = [TII, HWReg] (MachineInstr *MI) {
    return HWReg == getHWReg(TII, *MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
  return SetRegWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
  if (!MI.mayStore())
    return -1;

  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned Opcode = MI.getOpcode();
  const MCInstrDesc &Desc = MI.getDesc();

  int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
  int VDataRCID = -1;
  if (VDataIdx != -1)
    VDataRCID = Desc.OpInfo[VDataIdx].RegClass;

  if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
    // There is no hazard if the instruction does not use vector regs
    // (like wbinvl1)
    if (VDataIdx == -1)
      return -1;
    // For MUBUF/MTBUF instructions this hazard only exists if the
    // instruction is not using a register in the soffset field.
    const MachineOperand *SOffset =
        TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
    // If we have no soffset operand, then assume this field has been
    // hardcoded to zero.
    if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
        (!SOffset || !SOffset->isReg()))
      return VDataIdx;
  }

  // MIMG instructions create a hazard if they don't use a 256-bit T# and
  // the store size is greater than 8 bytes and they have more than two bits
  // of their dmask set.
  // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
  if (TII->isMIMG(MI)) {
    int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
    assert(SRsrcIdx != -1 &&
           AMDGPU::getRegBitWidth(Desc.OpInfo[SRsrcIdx].RegClass) == 256);
    (void)SRsrcIdx;
  }

  if (TII->isFLAT(MI)) {
    int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
    if (AMDGPU::getRegBitWidth(Desc.OpInfo[DataIdx].RegClass) > 64)
      return DataIdx;
  }

  return -1;
}

int
GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
                                            const MachineRegisterInfo &MRI) {
  // Helper to check for the hazard where VMEM instructions that store more
  // than 8 bytes can have their store data overwritten by the next
  // instruction.
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  const int VALUWaitStates = 1;
  int WaitStatesNeeded = 0;

  if (!TRI->isVGPR(MRI, Def.getReg()))
    return WaitStatesNeeded;
  Register Reg = Def.getReg();
  auto IsHazardFn = [this, Reg, TRI] (MachineInstr *MI) {
    int DataIdx = createsVALUHazard(*MI);
    return DataIdx >= 0 &&
           TRI->regsOverlap(MI->getOperand(DataIdx).getReg(), Reg);
  };
  int WaitStatesNeededForDef =
      VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
  // This checks for the hazard where VMEM instructions that store more than
  // 8 bytes can have their store data overwritten by the next instruction.
  if (!ST.has12DWordStoreHazard())
    return 0;

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  int WaitStatesNeeded = 0;

  for (const MachineOperand &Def : VALU->defs()) {
    WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
  // This checks for hazards associated with inline asm statements.
  // Since inline asms can contain just about anything, we use this
  // to call/leverage other check*Hazard routines. Note that
  // this function doesn't attempt to address all possible inline asm
  // hazards (good luck), but is a collection of what has been
  // problematic thus far.

  // see checkVALUHazards()
  if (!ST.has12DWordStoreHazard())
    return 0;

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  int WaitStatesNeeded = 0;

  for (unsigned I = InlineAsm::MIOp_FirstOperand, E = IA->getNumOperands();
       I != E; ++I) {
    const MachineOperand &Op = IA->getOperand(I);
    if (Op.isReg() && Op.isDef()) {
      WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
    }
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  const MachineOperand *LaneSelectOp =
      TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);

  if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
    return 0;

  Register LaneSelectReg = LaneSelectOp->getReg();
  auto IsHazardFn = [TII] (MachineInstr *MI) {
    return TII->isVALU(*MI);
  };

  const int RWLaneWaitStates = 4;
  int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
                                              RWLaneWaitStates);
  return RWLaneWaitStates - WaitStatesSince;
}

int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
  if (!ST.hasRFEHazards())
    return 0;

  const SIInstrInfo *TII = ST.getInstrInfo();

  const int RFEWaitStates = 1;

  auto IsHazardFn = [TII] (MachineInstr *MI) {
    return getHWReg(TII, *MI) == AMDGPU::Hwreg::ID_TRAPSTS;
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
  return RFEWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const int SMovRelWaitStates = 1;
  auto IsHazardFn = [TII] (MachineInstr *MI) {
    return TII->isSALU(*MI);
  };
  return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn,
                                                   SMovRelWaitStates);
}

void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
  fixVMEMtoScalarWriteHazards(MI);
  fixVcmpxPermlaneHazards(MI);
  fixSMEMtoVectorWriteHazards(MI);
  fixVcmpxExecWARHazard(MI);
  fixLdsBranchVmemWARHazard(MI);
}

bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
  if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  auto IsHazardFn = [TII] (MachineInstr *MI) {
    return TII->isVOPC(*MI);
  };

  auto IsExpiredFn = [] (MachineInstr *MI, int) {
    if (!MI)
      return false;
    unsigned Opc = MI->getOpcode();
    return SIInstrInfo::isVALU(*MI) &&
           Opc != AMDGPU::V_NOP_e32 &&
           Opc != AMDGPU::V_NOP_e64 &&
           Opc != AMDGPU::V_NOP_sdwa;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  // V_NOP will be discarded by SQ.
  // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
  // which is always a VGPR and available.
  auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
  Register Reg = Src0->getReg();
  bool IsUndef = Src0->isUndef();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::V_MOV_B32_e32))
    .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
    .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);

  return true;
}

bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
  if (!ST.hasVMEMtoScalarWriteHazard())
    return false;

  if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
    return false;

  if (MI->getNumDefs() == 0)
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [TRI, MI] (MachineInstr *I) {
    if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isDS(*I) &&
        !SIInstrInfo::isFLAT(*I))
      return false;

    for (const MachineOperand &Def : MI->defs()) {
      MachineOperand *Op = I->findRegisterUseOperand(Def.getReg(), false, TRI);
      if (!Op)
        continue;
      return true;
    }
    return false;
  };

  auto IsExpiredFn = [](MachineInstr *MI, int) {
    return MI && (SIInstrInfo::isVALU(*MI) ||
                  (MI->getOpcode() == AMDGPU::S_WAITCNT &&
                   !MI->getOperand(0).getImm()) ||
                  (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
                   MI->getOperand(0).getImm() == 0xffe3));
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(0xffe3);
  return true;
}

bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
  if (!ST.hasSMEMtoVectorWriteHazard())
    return false;

  if (!SIInstrInfo::isVALU(*MI))
    return false;

  unsigned SDSTName;
  switch (MI->getOpcode()) {
  case AMDGPU::V_READLANE_B32:
  case AMDGPU::V_READFIRSTLANE_B32:
    SDSTName = AMDGPU::OpName::vdst;
    break;
  default:
    SDSTName = AMDGPU::OpName::sdst;
    break;
  }

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
  const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
  if (!SDST) {
    for (const auto &MO : MI->implicit_operands()) {
      if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg()))) {
        SDST = &MO;
        break;
      }
    }
  }

  if (!SDST)
    return false;

  const Register SDSTReg = SDST->getReg();
  auto IsHazardFn = [SDSTReg, TRI] (MachineInstr *I) {
    return SIInstrInfo::isSMRD(*I) && I->readsRegister(SDSTReg, TRI);
  };

  auto IsExpiredFn = [TII, IV] (MachineInstr *MI, int) {
    if (MI) {
      if (TII->isSALU(*MI)) {
        switch (MI->getOpcode()) {
        case AMDGPU::S_SETVSKIP:
        case AMDGPU::S_VERSION:
        case AMDGPU::S_WAITCNT_VSCNT:
        case AMDGPU::S_WAITCNT_VMCNT:
        case AMDGPU::S_WAITCNT_EXPCNT:
          // These instructions cannot mitigate the hazard.
          return false;
        case AMDGPU::S_WAITCNT_LGKMCNT:
          // Reducing lgkmcnt count to 0 always mitigates the hazard.
          return (MI->getOperand(1).getImm() == 0) &&
                 (MI->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
        case AMDGPU::S_WAITCNT: {
          const int64_t Imm = MI->getOperand(0).getImm();
          AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
          return (Decoded.LgkmCnt == 0);
        }
        default:
          // SOPP instructions cannot mitigate the hazard.
          if (TII->isSOPP(*MI))
            return false;
          // At this point the SALU can be assumed to mitigate the hazard
          // because either:
          // (a) it is independent of the at risk SMEM (breaking chain),
          // or
          // (b) it is dependent on the SMEM, in which case an appropriate
          //     s_waitcnt lgkmcnt _must_ exist between it and the at risk
          //     SMEM instruction.
          return true;
        }
      }
    }
    return false;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
      .addImm(0);
  return true;
}

bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
  if (!ST.hasVcmpxExecWARHazard() || !SIInstrInfo::isVALU(*MI))
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
    return false;

  auto IsHazardFn = [TRI] (MachineInstr *I) {
    if (SIInstrInfo::isVALU(*I))
      return false;
    return I->readsRegister(AMDGPU::EXEC, TRI);
  };

  const SIInstrInfo *TII = ST.getInstrInfo();
  auto IsExpiredFn = [TII, TRI] (MachineInstr *MI, int) {
    if (!MI)
      return false;
    if (SIInstrInfo::isVALU(*MI)) {
      if (TII->getNamedOperand(*MI, AMDGPU::OpName::sdst))
        return true;
      for (auto MO : MI->implicit_operands())
        if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg())))
          return true;
    }
    if (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
        (MI->getOperand(0).getImm() & 0xfffe) == 0xfffe)
      return true;
    return false;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(0xfffe);
  return true;
}

bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
  if (!ST.hasLdsBranchVmemWARHazard())
    return false;

  auto IsHazardInst = [] (const MachineInstr *MI) {
    if (SIInstrInfo::isDS(*MI))
      return 1;
    if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isSegmentSpecificFLAT(*MI))
      return 2;
    return 0;
  };

  auto InstType = IsHazardInst(MI);
  if (!InstType)
    return false;

  auto IsExpiredFn = [&IsHazardInst] (MachineInstr *I, int) {
    return I && (IsHazardInst(I) ||
                 (I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
                  I->getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
                  !I->getOperand(1).getImm()));
  };

  auto IsHazardFn = [InstType, &IsHazardInst] (MachineInstr *I) {
    if (!I->isBranch())
      return false;

    auto IsHazardFn = [InstType, IsHazardInst] (MachineInstr *I) {
      auto InstType2 = IsHazardInst(I);
      return InstType2 && InstType != InstType2;
    };

    auto IsExpiredFn = [InstType, &IsHazardInst] (MachineInstr *I, int) {
      if (!I)
        return false;

      auto InstType2 = IsHazardInst(I);
      if (InstType == InstType2)
        return true;

      return I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
             I->getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
             !I->getOperand(1).getImm();
    };

    return ::getWaitStatesSince(IsHazardFn, I, IsExpiredFn) !=
           std::numeric_limits<int>::max();
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_VSCNT))
    .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
    .addImm(0);

  return true;
}

int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
  int NSAtoVMEMWaitStates = 1;

  if (!ST.hasNSAtoVMEMBug())
    return 0;

  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
    return 0;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
  if (!Offset || (Offset->getImm() & 6) == 0)
    return 0;

  auto IsHazardFn = [TII] (MachineInstr *I) {
    if (!SIInstrInfo::isMIMG(*I))
      return false;
    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I->getOpcode());
    return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
           TII->getInstSizeInBytes(*I) >= 16;
  };

  return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
}

int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
  int FPAtomicToDenormModeWaitStates = 3;

  if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
    return 0;

  auto IsHazardFn = [] (MachineInstr *I) {
    if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isFLAT(*I))
      return false;
    return SIInstrInfo::isFPAtomic(*I);
  };

  auto IsExpiredFn = [] (MachineInstr *MI, int WaitStates) {
    if (WaitStates >= 3 || SIInstrInfo::isVALU(*MI))
      return true;

    switch (MI->getOpcode()) {
    case AMDGPU::S_WAITCNT:
    case AMDGPU::S_WAITCNT_VSCNT:
    case AMDGPU::S_WAITCNT_VMCNT:
    case AMDGPU::S_WAITCNT_EXPCNT:
    case AMDGPU::S_WAITCNT_LGKMCNT:
    case AMDGPU::S_WAIT_IDLE:
      return true;
    default:
      break;
    }

    return false;
  };

  return FPAtomicToDenormModeWaitStates -
         ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
}

int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
  assert(SIInstrInfo::isMAI(*MI));

  int WaitStatesNeeded = 0;
  unsigned Opc = MI->getOpcode();

  auto IsVALUFn = [] (MachineInstr *MI) {
    return SIInstrInfo::isVALU(*MI);
  };

  if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
    const int LegacyVALUWritesVGPRWaitStates = 2;
    const int VALUWritesExecWaitStates = 4;
    const int MaxWaitStates = 4;

    int WaitStatesNeededForUse = VALUWritesExecWaitStates -
      getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded < MaxWaitStates) {
      for (const MachineOperand &Use : MI->explicit_uses()) {
        const int MaxWaitStates = 2;

        if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
          continue;

        int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
          getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

        if (WaitStatesNeeded == MaxWaitStates)
          break;
      }
    }
  }

  auto IsMFMAFn = [] (MachineInstr *MI) {
    return SIInstrInfo::isMAI(*MI) &&
           MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
           MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
  };

  for (const MachineOperand &Op : MI->explicit_operands()) {
    if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
      continue;

    if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
      continue;

    const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
    const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
    const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
    const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
    const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
    const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
    const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
    const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
    const int MaxWaitStates = 18;
    Register Reg = Op.getReg();
    unsigned HazardDefLatency = 0;

    auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &HazardDefLatency, this]
                              (MachineInstr *MI) {
      if (!IsMFMAFn(MI))
        return false;
      Register DstReg = MI->getOperand(0).getReg();
      if (DstReg == Reg)
        return false;
      HazardDefLatency = std::max(HazardDefLatency,
                                  TSchedModel.computeInstrLatency(MI));
      return TRI.regsOverlap(DstReg, Reg);
    };

    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
                                                   MaxWaitStates);
    int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
    int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
    int OpNo = MI->getOperandNo(&Op);
    if (OpNo == SrcCIdx) {
      NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
    } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
      switch (HazardDefLatency) {
      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
               break;
      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
               break;
      case 16: LLVM_FALLTHROUGH;
      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
               break;
      }
    } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
      switch (HazardDefLatency) {
      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
               break;
      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
               break;
      case 16: LLVM_FALLTHROUGH;
      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
               break;
      }
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.

    auto IsAccVgprWriteFn = [Reg, this] (MachineInstr *MI) {
      if (MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
        return false;
      Register DstReg = MI->getOperand(0).getReg();
      return TRI.regsOverlap(Reg, DstReg);
    };

    const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
    const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
    const int AccVGPRWriteAccVgprReadWaitStates = 3;
    NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
    if (OpNo == SrcCIdx)
      NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
    else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
      NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;

    WaitStatesNeededForUse = NeedWaitStates -
      getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.
  }

  if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
    const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
    const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
    const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
    const int MaxWaitStates = 13;
    Register DstReg = MI->getOperand(0).getReg();
    unsigned HazardDefLatency = 0;

    auto IsSrcCMFMAFn = [DstReg, &IsMFMAFn, &HazardDefLatency, this]
                        (MachineInstr *MI) {
      if (!IsMFMAFn(MI))
        return false;
      Register Reg = TII.getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
      HazardDefLatency = std::max(HazardDefLatency,
                                  TSchedModel.computeInstrLatency(MI));
      return TRI.regsOverlap(Reg, DstReg);
    };

    int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
    int NeedWaitStates;
    switch (HazardDefLatency) {
    case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
             break;
    case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
             break;
    case 16: LLVM_FALLTHROUGH;
    default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
             break;
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
  if (!ST.hasMAIInsts())
    return 0;

  int WaitStatesNeeded = 0;

  auto IsAccVgprReadFn = [] (MachineInstr *MI) {
    return MI->getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
  };

  for (const MachineOperand &Op : MI->explicit_uses()) {
    if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
      continue;

    Register Reg = Op.getReg();

    const int AccVgprReadLdStWaitStates = 2;
    const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
    const int MaxWaitStates = 2;

    int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
      getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.

    auto IsVALUAccVgprRdWrCheckFn = [Reg, this](MachineInstr *MI) {
      if (MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
          MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
        return false;
      auto IsVALUFn = [] (MachineInstr *MI) {
        return SIInstrInfo::isVALU(*MI) && !SIInstrInfo::isMAI(*MI);
      };
      return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
             std::numeric_limits<int>::max();
    };

    WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
      getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  return WaitStatesNeeded;
}

bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
  if (!SU->isInstr())
    return false;

  MachineInstr *MAI = nullptr;
  auto IsMFMAFn = [&MAI] (MachineInstr *MI) {
    MAI = nullptr;
    if (SIInstrInfo::isMAI(*MI) &&
        MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
        MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64)
      MAI = MI;
    return MAI != nullptr;
  };

  MachineInstr *MI = SU->getInstr();
  if (IsMFMAFn(MI)) {
    int W = getWaitStatesSince(IsMFMAFn, 16);
    if (MAI)
      return W < (int)TSchedModel.computeInstrLatency(MAI);
  }

  return false;
}