1 //===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements hazard recognizers for scheduling on GCN processors.
10 //
11 //===----------------------------------------------------------------------===//
12
13 #include "GCNHazardRecognizer.h"
14 #include "GCNSubtarget.h"
15 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
16 #include "SIMachineFunctionInfo.h"
17 #include "llvm/CodeGen/MachineFrameInfo.h"
18 #include "llvm/CodeGen/MachineFunction.h"
19 #include "llvm/CodeGen/ScheduleDAG.h"
20 #include "llvm/TargetParser/TargetParser.h"
21
22 using namespace llvm;
23
24 namespace {
25
26 struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
27   MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}
28
29   bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
30 if (Arg.getAsInteger(0, Value))
31 return O.error("'" + Arg + "' value invalid for uint argument!");
32
33 if (Value > 100)
34 return O.error("'" + Arg + "' value must be in the range [0, 100]!");
35
36 return false;
37 }
38 };
39
40 } // end anonymous namespace
41
42 static cl::opt<unsigned, false, MFMAPaddingRatioParser>
43 MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden,
44 cl::desc("Fill a percentage of the latency between "
45 "neighboring MFMA with s_nops."));
46
47 //===----------------------------------------------------------------------===//
48 // Hazard Recognizer Implementation
49 //===----------------------------------------------------------------------===//
50
51 static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
52 const GCNSubtarget &ST);
53
54 GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
55 IsHazardRecognizerMode(false),
56 CurrCycleInstr(nullptr),
57 MF(MF),
58 ST(MF.getSubtarget<GCNSubtarget>()),
59 TII(*ST.getInstrInfo()),
60 TRI(TII.getRegisterInfo()),
61 ClauseUses(TRI.getNumRegUnits()),
62 ClauseDefs(TRI.getNumRegUnits()) {
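  // If AGPRs are in use the function contains MFMAs, whose hazards can need
  // far more wait states than the common case, so use a larger lookahead
  // window.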
63 MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
64 TSchedModel.init(&ST);
65 RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
66 }
67
68 void GCNHazardRecognizer::Reset() {
69 EmittedInstrs.clear();
70 }
71
72 void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
73 EmitInstruction(SU->getInstr());
74 }
75
76 void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
77 CurrCycleInstr = MI;
78 }
79
80 static bool isDivFMas(unsigned Opcode) {
81 return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
82 }
83
84 static bool isSGetReg(unsigned Opcode) {
85 return Opcode == AMDGPU::S_GETREG_B32;
86 }
87
88 static bool isSSetReg(unsigned Opcode) {
89 switch (Opcode) {
90 case AMDGPU::S_SETREG_B32:
91 case AMDGPU::S_SETREG_B32_mode:
92 case AMDGPU::S_SETREG_IMM32_B32:
93 case AMDGPU::S_SETREG_IMM32_B32_mode:
94 return true;
95 }
96 return false;
97 }
98
99 static bool isRWLane(unsigned Opcode) {
100 return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
101 }
102
103 static bool isRFE(unsigned Opcode) {
104 return Opcode == AMDGPU::S_RFE_B64;
105 }
106
107 static bool isSMovRel(unsigned Opcode) {
108 switch (Opcode) {
109 case AMDGPU::S_MOVRELS_B32:
110 case AMDGPU::S_MOVRELS_B64:
111 case AMDGPU::S_MOVRELD_B32:
112 case AMDGPU::S_MOVRELD_B64:
113 return true;
114 default:
115 return false;
116 }
117 }
118
119 static bool isDGEMM(unsigned Opcode) {
120 return AMDGPU::getMAIIsDGEMM(Opcode);
121 }
122
123 static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) {
124 unsigned Opcode = MI.getOpcode();
125
126 if (!SIInstrInfo::isMAI(MI) ||
127 isDGEMM(Opcode) ||
128 Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
129 Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
130 return false;
131
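  // Before GFX940, every remaining (non-DGEMM) MAI instruction uses the XDL
  // pipeline; from GFX940 onwards this is tracked per opcode.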
132 if (!ST.hasGFX940Insts())
133 return true;
134
135 return AMDGPU::getMAIIsGFX940XDL(Opcode);
136 }
137
138 static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
139 const MachineInstr &MI) {
140 if (TII.isAlwaysGDS(MI.getOpcode()))
141 return true;
142
143 switch (MI.getOpcode()) {
144 case AMDGPU::S_SENDMSG:
145 case AMDGPU::S_SENDMSGHALT:
146 case AMDGPU::S_TTRACEDATA:
147 return true;
148 // These DS opcodes don't support GDS.
149 case AMDGPU::DS_NOP:
150 case AMDGPU::DS_PERMUTE_B32:
151 case AMDGPU::DS_BPERMUTE_B32:
152 return false;
153 default:
154 if (TII.isDS(MI.getOpcode())) {
155 int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
156 AMDGPU::OpName::gds);
157 if (MI.getOperand(GDS).getImm())
158 return true;
159 }
160 return false;
161 }
162 }
163
164 static bool isPermlane(const MachineInstr &MI) {
165 unsigned Opcode = MI.getOpcode();
166 return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
167 Opcode == AMDGPU::V_PERMLANE64_B32 ||
168 Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
169 Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
170 Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64;
171 }
172
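// LDS DMA operations (VMEM loads that write directly to LDS) show up as
// MUBUF/FLAT instructions that also carry the VALU flag.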
173 static bool isLdsDma(const MachineInstr &MI) {
174 return SIInstrInfo::isVALU(MI) &&
175 (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI));
176 }
177
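// Decode the hardware register ID from the simm16 operand of an
// s_setreg/s_getreg instruction.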
178 static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
179 const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
180 AMDGPU::OpName::simm16);
181 return std::get<0>(AMDGPU::Hwreg::HwregEncoding::decode(RegOp->getImm()));
182 }
183
184 ScheduleHazardRecognizer::HazardType
185 GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
186 MachineInstr *MI = SU->getInstr();
187 // If we are not in "HazardRecognizerMode" and therefore not being run from
188 // the scheduler, track possible stalls from hazards but don't insert noops.
189 auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;
190
191 if (MI->isBundle())
192 return NoHazard;
193
194 if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
195 return HazardType;
196
197 if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
198 return HazardType;
199
200 if (checkFPAtomicToDenormModeHazard(MI) > 0)
201 return HazardType;
202
203 if (ST.hasNoDataDepHazard())
204 return NoHazard;
205
206 // FIXME: Should flat be considered vmem?
207 if ((SIInstrInfo::isVMEM(*MI) ||
208 SIInstrInfo::isFLAT(*MI))
209 && checkVMEMHazards(MI) > 0)
210 return HazardType;
211
212 if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
213 return HazardType;
214
215 if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
216 return HazardType;
217
218 if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
219 return HazardType;
220
221 if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
222 return HazardType;
223
224 if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
225 SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
226 SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
227 return HazardType;
228
229 if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
230 return HazardType;
231
232 if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
233 return HazardType;
234
235 if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
236 return HazardType;
237
238 if (((ST.hasReadM0MovRelInterpHazard() &&
239 (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
240 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
241 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
242 (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
243 (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
244 (ST.hasReadM0LdsDirectHazard() &&
245 MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr))) &&
246 checkReadM0Hazards(MI) > 0)
247 return HazardType;
248
249 if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
250 return HazardType;
251
252 if ((SIInstrInfo::isVMEM(*MI) ||
253 SIInstrInfo::isFLAT(*MI) ||
254 SIInstrInfo::isDS(*MI)) && checkMAILdStHazards(MI) > 0)
255 return HazardType;
256
257 if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
258 return HazardType;
259
260 return NoHazard;
261 }
262
263 static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
264 unsigned Quantity) {
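  // S_NOP's immediate encodes between 1 and 8 wait states, so emit the
  // requested amount in chunks of at most 8.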
265 while (Quantity > 0) {
266 unsigned Arg = std::min(Quantity, 8u);
267 Quantity -= Arg;
268 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
269 .addImm(Arg - 1);
270 }
271 }
272
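// Number of cycles the MFMA pipeline is occupied by \p MI according to the
// scheduling model (the release cycle of its first write resource).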
273 unsigned
274 GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
275 const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI);
276 assert(TSchedModel.getWriteProcResBegin(SC) !=
277 TSchedModel.getWriteProcResEnd(SC));
278 return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle;
279 }
280
281 void GCNHazardRecognizer::processBundle() {
282 MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
283 MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
284 // Check bundled MachineInstr's for hazards.
285 for (; MI != E && MI->isInsideBundle(); ++MI) {
286 CurrCycleInstr = &*MI;
287 unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
288
289 if (IsHazardRecognizerMode) {
290 fixHazards(CurrCycleInstr);
291
292 insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
293 }
294
295 // It’s unnecessary to track more than MaxLookAhead instructions. Since we
296 // include the bundled MI directly after, only add a maximum of
297 // (MaxLookAhead - 1) noops to EmittedInstrs.
298 for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
299 EmittedInstrs.push_front(nullptr);
300
301 EmittedInstrs.push_front(CurrCycleInstr);
302 EmittedInstrs.resize(MaxLookAhead);
303 }
304 CurrCycleInstr = nullptr;
305 }
306
307 void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) {
308 assert(IsHazardRecognizerMode);
309
310 unsigned NumPreNoops = PreEmitNoops(MI);
311 EmitNoops(NumPreNoops);
312 if (MI->isInsideBundle())
313 insertNoopsInBundle(MI, TII, NumPreNoops);
314 else
315 TII.insertNoops(*MI->getParent(), MachineBasicBlock::iterator(MI),
316 NumPreNoops);
317 EmitInstruction(MI);
318 AdvanceCycle();
319 }
320
321 unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
322 IsHazardRecognizerMode = true;
323 CurrCycleInstr = MI;
324 unsigned W = PreEmitNoopsCommon(MI);
325 fixHazards(MI);
326 CurrCycleInstr = nullptr;
327 return W;
328 }
329
330 unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
331 if (MI->isBundle())
332 return 0;
333
334 int WaitStates = 0;
335
336 if (SIInstrInfo::isSMRD(*MI))
337 return std::max(WaitStates, checkSMRDHazards(MI));
338
339 if (ST.hasNSAtoVMEMBug())
340 WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));
341
342 WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));
343
344 if (ST.hasNoDataDepHazard())
345 return WaitStates;
346
347 if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
348 WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
349
350 if (SIInstrInfo::isVALU(*MI))
351 WaitStates = std::max(WaitStates, checkVALUHazards(MI));
352
353 if (SIInstrInfo::isDPP(*MI))
354 WaitStates = std::max(WaitStates, checkDPPHazards(MI));
355
356 if (isDivFMas(MI->getOpcode()))
357 WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
358
359 if (isRWLane(MI->getOpcode()))
360 WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
361
362 if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
363 SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
364 SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
365 WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));
366
367 if (MI->isInlineAsm())
368 return std::max(WaitStates, checkInlineAsmHazards(MI));
369
370 if (isSGetReg(MI->getOpcode()))
371 return std::max(WaitStates, checkGetRegHazards(MI));
372
373 if (isSSetReg(MI->getOpcode()))
374 return std::max(WaitStates, checkSetRegHazards(MI));
375
376 if (isRFE(MI->getOpcode()))
377 return std::max(WaitStates, checkRFEHazards(MI));
378
379 if ((ST.hasReadM0MovRelInterpHazard() &&
380 (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
381 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
382 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
383 (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
384 (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
385 (ST.hasReadM0LdsDirectHazard() &&
386 MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr)))
387 return std::max(WaitStates, checkReadM0Hazards(MI));
388
389 if (SIInstrInfo::isMAI(*MI))
390 return std::max(WaitStates, checkMAIHazards(MI));
391
392 if (SIInstrInfo::isVMEM(*MI) ||
393 SIInstrInfo::isFLAT(*MI) ||
394 SIInstrInfo::isDS(*MI))
395 return std::max(WaitStates, checkMAILdStHazards(MI));
396
397 return WaitStates;
398 }
399
400 void GCNHazardRecognizer::EmitNoop() {
401 EmittedInstrs.push_front(nullptr);
402 }
403
404 void GCNHazardRecognizer::AdvanceCycle() {
405 // When the scheduler detects a stall, it will call AdvanceCycle() without
406 // emitting any instructions.
407 if (!CurrCycleInstr) {
408 EmittedInstrs.push_front(nullptr);
409 return;
410 }
411
412 if (CurrCycleInstr->isBundle()) {
413 processBundle();
414 return;
415 }
416
417 unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
418 if (!NumWaitStates) {
419 CurrCycleInstr = nullptr;
420 return;
421 }
422
423 // Keep track of emitted instructions
424 EmittedInstrs.push_front(CurrCycleInstr);
425
426 // Add a nullptr for each additional wait state after the first. Make sure
427 // not to add more than getMaxLookAhead() items to the list, since we
428 // truncate the list to that size right after this loop.
429 for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
430 i < e; ++i) {
431 EmittedInstrs.push_front(nullptr);
432 }
433
434 // getMaxLookAhead() is the largest number of wait states we will ever need
435 // to insert, so there is no point in keeping track of more than that many
436 // wait states.
437 EmittedInstrs.resize(getMaxLookAhead());
438
439 CurrCycleInstr = nullptr;
440 }
441
442 void GCNHazardRecognizer::RecedeCycle() {
443 llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
444 }
445
446 //===----------------------------------------------------------------------===//
447 // Helper Functions
448 //===----------------------------------------------------------------------===//
449
450 using HazardFnResult = enum { HazardFound, HazardExpired, NoHazardFound };
451
452 using IsExpiredFn = function_ref<bool(const MachineInstr &, int WaitStates)>;
453 using GetNumWaitStatesFn = function_ref<unsigned int(const MachineInstr &)>;
454
455 // Search for a hazard in a block and its predecessors.
456 template <typename StateT>
457 static bool
458 hasHazard(StateT State,
459 function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard,
460 function_ref<void(StateT &, const MachineInstr &)> UpdateState,
461 const MachineBasicBlock *MBB,
462 MachineBasicBlock::const_reverse_instr_iterator I,
463 DenseSet<const MachineBasicBlock *> &Visited) {
464 for (auto E = MBB->instr_rend(); I != E; ++I) {
465 // No need to look at parent BUNDLE instructions.
466 if (I->isBundle())
467 continue;
468
469 switch (IsHazard(State, *I)) {
470 case HazardFound:
471 return true;
472 case HazardExpired:
473 return false;
474 default:
475 // Continue search
476 break;
477 }
478
479 if (I->isInlineAsm() || I->isMetaInstruction())
480 continue;
481
482 UpdateState(State, *I);
483 }
484
485 for (MachineBasicBlock *Pred : MBB->predecessors()) {
486 if (!Visited.insert(Pred).second)
487 continue;
488
489 if (hasHazard(State, IsHazard, UpdateState, Pred, Pred->instr_rbegin(),
490 Visited))
491 return true;
492 }
493
494 return false;
495 }
496
497 // Returns the minimum number of wait states since \p I, walking all predecessors.
498 // Scanning stops once \p IsExpired returns true.
499 // Can only be run in hazard recognizer mode.
500 static int getWaitStatesSince(
501 GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB,
502 MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates,
503 IsExpiredFn IsExpired, DenseSet<const MachineBasicBlock *> &Visited,
504 GetNumWaitStatesFn GetNumWaitStates = SIInstrInfo::getNumWaitStates) {
505 for (auto E = MBB->instr_rend(); I != E; ++I) {
506 // Don't add WaitStates for parent BUNDLE instructions.
507 if (I->isBundle())
508 continue;
509
510 if (IsHazard(*I))
511 return WaitStates;
512
513 if (I->isInlineAsm())
514 continue;
515
516 WaitStates += GetNumWaitStates(*I);
517
518 if (IsExpired(*I, WaitStates))
519 return std::numeric_limits<int>::max();
520 }
521
522 int MinWaitStates = std::numeric_limits<int>::max();
523 for (MachineBasicBlock *Pred : MBB->predecessors()) {
524 if (!Visited.insert(Pred).second)
525 continue;
526
527 int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), WaitStates,
528 IsExpired, Visited, GetNumWaitStates);
529
530 MinWaitStates = std::min(MinWaitStates, W);
531 }
532
533 return MinWaitStates;
534 }
535
536 static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
537 const MachineInstr *MI, IsExpiredFn IsExpired) {
538 DenseSet<const MachineBasicBlock *> Visited;
539 return getWaitStatesSince(IsHazard, MI->getParent(),
540 std::next(MI->getReverseIterator()),
541 0, IsExpired, Visited);
542 }
543
544 int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
545 if (IsHazardRecognizerMode) {
546 auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
547 return WaitStates >= Limit;
548 };
549 return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
550 }
551
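  // Not in hazard recognizer mode: scan the window of recently emitted
  // instructions instead of walking the CFG.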
552 int WaitStates = 0;
553 for (MachineInstr *MI : EmittedInstrs) {
554 if (MI) {
555 if (IsHazard(*MI))
556 return WaitStates;
557
558 if (MI->isInlineAsm())
559 continue;
560 }
561 ++WaitStates;
562
563 if (WaitStates >= Limit)
564 break;
565 }
566 return std::numeric_limits<int>::max();
567 }
568
569 int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
570 IsHazardFn IsHazardDef,
571 int Limit) {
572 const SIRegisterInfo *TRI = ST.getRegisterInfo();
573
574 auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
575 return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
576 };
577
578 return getWaitStatesSince(IsHazardFn, Limit);
579 }
580
581 int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
582 int Limit) {
583 auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
584 return isSSetReg(MI.getOpcode()) && IsHazard(MI);
585 };
586
587 return getWaitStatesSince(IsHazardFn, Limit);
588 }
589
590 //===----------------------------------------------------------------------===//
591 // No-op Hazard Detection
592 //===----------------------------------------------------------------------===//
593
594 static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
595 MCRegister Reg) {
596 for (MCRegUnit Unit : TRI.regunits(Reg))
597 BV.set(Unit);
598 }
599
600 static void addRegsToSet(const SIRegisterInfo &TRI,
601 iterator_range<MachineInstr::const_mop_iterator> Ops,
602 BitVector &DefSet, BitVector &UseSet) {
603 for (const MachineOperand &Op : Ops) {
604 if (Op.isReg())
605 addRegUnits(TRI, Op.isDef() ? DefSet : UseSet, Op.getReg().asMCReg());
606 }
607 }
608
609 void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
610 addRegsToSet(TRI, MI.operands(), ClauseDefs, ClauseUses);
611 }
612
613 static bool breaksSMEMSoftClause(MachineInstr *MI) {
614 return !SIInstrInfo::isSMRD(*MI);
615 }
616
617 static bool breaksVMEMSoftClause(MachineInstr *MI) {
618 return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI);
619 }
620
621 int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
622 // SMEM soft clauses are only present on VI+, and only matter if xnack is
623 // enabled.
624 if (!ST.isXNACKEnabled())
625 return 0;
626
627 bool IsSMRD = TII.isSMRD(*MEM);
628
629 resetClause();
630
631 // A soft-clause is any group of consecutive SMEM instructions. The
632 // instructions in this group may return out of order and/or may be
633 // replayed (i.e. the same instruction issued more than once).
634 //
635 // In order to handle these situations correctly we need to make sure that
636 // when a clause has more than one instruction, no instruction in the clause
637 // writes to a register that is read by another instruction in the clause
638 // (including itself). If we encounter this situation, we need to break the
639 // clause by inserting a non-SMEM instruction.
640
641 for (MachineInstr *MI : EmittedInstrs) {
642 // When we hit a non-SMEM instruction then we have passed the start of the
643 // clause and we can stop.
644 if (!MI)
645 break;
646
647 if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
648 break;
649
650 addClauseInst(*MI);
651 }
652
653 if (ClauseDefs.none())
654 return 0;
655
656 // We need to make sure not to put loads and stores in the same clause if they
657 // use the same address. For now, just start a new clause whenever we see a
658 // store.
659 if (MEM->mayStore())
660 return 1;
661
662 addClauseInst(*MEM);
663
664 // If the set of defs and uses intersect then we cannot add this instruction
665 // to the clause, so we have a hazard.
666 return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
667 }
668
669 int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
670 int WaitStatesNeeded = 0;
671
672 WaitStatesNeeded = checkSoftClauseHazards(SMRD);
673
674 // This SMRD hazard only affects SI.
675 if (!ST.hasSMRDReadVALUDefHazard())
676 return WaitStatesNeeded;
677
678 // A read of an SGPR by an SMRD instruction requires 4 wait states when the
679 // SGPR was written by a VALU instruction.
680 int SmrdSgprWaitStates = 4;
681 auto IsHazardDefFn = [this](const MachineInstr &MI) {
682 return TII.isVALU(MI);
683 };
684 auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
685 return TII.isSALU(MI);
686 };
687
688 bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);
689
690 for (const MachineOperand &Use : SMRD->uses()) {
691 if (!Use.isReg())
692 continue;
693 int WaitStatesNeededForUse =
694 SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
695 SmrdSgprWaitStates);
696 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
697
698 // This fixes what appears to be undocumented hardware behavior in SI where
699 // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
700 // need some number of nops in between. We don't know how many we need, but
701 // let's use 4. This wasn't discovered before probably because the only
702 // case when this happens is when we expand a 64-bit pointer into a full
703 // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
704 // probably never encountered in the closed-source land.
705 if (IsBufferSMRD) {
706 int WaitStatesNeededForUse =
707 SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
708 IsBufferHazardDefFn,
709 SmrdSgprWaitStates);
710 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
711 }
712 }
713
714 return WaitStatesNeeded;
715 }
716
717 int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
718 if (!ST.hasVMEMReadSGPRVALUDefHazard())
719 return 0;
720
721 int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
722
723 // A read of an SGPR by a VMEM instruction requires 5 wait states when the
724 // SGPR was written by a VALU instruction.
725 const int VmemSgprWaitStates = 5;
726 auto IsHazardDefFn = [this](const MachineInstr &MI) {
727 return TII.isVALU(MI);
728 };
729 for (const MachineOperand &Use : VMEM->uses()) {
730 if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))
731 continue;
732
733 int WaitStatesNeededForUse =
734 VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
735 VmemSgprWaitStates);
736 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
737 }
738 return WaitStatesNeeded;
739 }
740
741 int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
742 const SIRegisterInfo *TRI = ST.getRegisterInfo();
743 const SIInstrInfo *TII = ST.getInstrInfo();
744
745 // Check for DPP VGPR read after VALU VGPR write and EXEC write.
746 int DppVgprWaitStates = 2;
747 int DppExecWaitStates = 5;
748 int WaitStatesNeeded = 0;
749 auto IsHazardDefFn = [TII](const MachineInstr &MI) {
750 return TII->isVALU(MI);
751 };
752
753 for (const MachineOperand &Use : DPP->uses()) {
754 if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
755 continue;
756 int WaitStatesNeededForUse =
757 DppVgprWaitStates - getWaitStatesSinceDef(
758 Use.getReg(),
759 [](const MachineInstr &) { return true; },
760 DppVgprWaitStates);
761 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
762 }
763
764 WaitStatesNeeded = std::max(
765 WaitStatesNeeded,
766 DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
767 DppExecWaitStates));
768
769 return WaitStatesNeeded;
770 }
771
772 int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
773 const SIInstrInfo *TII = ST.getInstrInfo();
774
775 // v_div_fmas requires 4 wait states after a write to vcc from a VALU
776 // instruction.
777 const int DivFMasWaitStates = 4;
778 auto IsHazardDefFn = [TII](const MachineInstr &MI) {
779 return TII->isVALU(MI);
780 };
781 int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
782 DivFMasWaitStates);
783
784 return DivFMasWaitStates - WaitStatesNeeded;
785 }
786
787 int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
788 const SIInstrInfo *TII = ST.getInstrInfo();
789 unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);
790
791 const int GetRegWaitStates = 2;
792 auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
793 return GetRegHWReg == getHWReg(TII, MI);
794 };
795 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);
796
797 return GetRegWaitStates - WaitStatesNeeded;
798 }
799
800 int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
801 const SIInstrInfo *TII = ST.getInstrInfo();
802 unsigned HWReg = getHWReg(TII, *SetRegInstr);
803
804 const int SetRegWaitStates = ST.getSetRegWaitStates();
805 auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
806 return HWReg == getHWReg(TII, MI);
807 };
808 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
809 return SetRegWaitStates - WaitStatesNeeded;
810 }
811
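// Returns the index of \p MI's store-data operand if overwriting that register
// in the next instruction would create a hazard, or -1 if no hazard is
// possible.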
812 int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
813 if (!MI.mayStore())
814 return -1;
815
816 const SIInstrInfo *TII = ST.getInstrInfo();
817 unsigned Opcode = MI.getOpcode();
818 const MCInstrDesc &Desc = MI.getDesc();
819
820 int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
821 int VDataRCID = -1;
822 if (VDataIdx != -1)
823 VDataRCID = Desc.operands()[VDataIdx].RegClass;
824
825 if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
826 // There is no hazard if the instruction does not use vector regs
827 // (like wbinvl1)
828 if (VDataIdx == -1)
829 return -1;
830 // For MUBUF/MTBUF instructions this hazard only exists if the
831 // instruction is not using a register in the soffset field.
832 const MachineOperand *SOffset =
833 TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
834 // If we have no soffset operand, then assume this field has been
835 // hardcoded to zero.
836 if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
837 (!SOffset || !SOffset->isReg()))
838 return VDataIdx;
839 }
840
841 // MIMG instructions create a hazard if they don't use a 256-bit T# and
842 // the store size is greater than 8 bytes and they have more than two bits
843 // of their dmask set.
844 // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
845 if (TII->isMIMG(MI)) {
846 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
847 assert(SRsrcIdx != -1 &&
848 AMDGPU::getRegBitWidth(Desc.operands()[SRsrcIdx].RegClass) == 256);
849 (void)SRsrcIdx;
850 }
851
852 if (TII->isFLAT(MI)) {
853 int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
854 if (AMDGPU::getRegBitWidth(Desc.operands()[DataIdx].RegClass) > 64)
855 return DataIdx;
856 }
857
858 return -1;
859 }
860
861 int
862 GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
863 const MachineRegisterInfo &MRI) {
864 // Helper to check for the hazard where VMEM instructions that store more than
865 // 8 bytes can have their store data overwritten by the next instruction.
866 const SIRegisterInfo *TRI = ST.getRegisterInfo();
867
868 const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1;
869 int WaitStatesNeeded = 0;
870
871 if (!TRI->isVectorRegister(MRI, Def.getReg()))
872 return WaitStatesNeeded;
873 Register Reg = Def.getReg();
874 auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
875 int DataIdx = createsVALUHazard(MI);
876 return DataIdx >= 0 &&
877 TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
878 };
879 int WaitStatesNeededForDef =
880 VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
881 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
882
883 return WaitStatesNeeded;
884 }
885
886 int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
887 int WaitStatesNeeded = 0;
888
889 if (ST.hasTransForwardingHazard() && !SIInstrInfo::isTRANS(*VALU)) {
890 const int TransDefWaitstates = 1;
891
892 auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {
893 if (!SIInstrInfo::isTRANS(MI))
894 return false;
895 const SIRegisterInfo *TRI = ST.getRegisterInfo();
896 const SIInstrInfo *TII = ST.getInstrInfo();
897 Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();
898
899 for (const MachineOperand &Use : VALU->explicit_uses()) {
900 if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
901 return true;
902 }
903
904 return false;
905 };
906
907 int WaitStatesNeededForDef =
908 TransDefWaitstates -
909 getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
910 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
911 }
912
913 if (ST.hasDstSelForwardingHazard()) {
914 const int Shift16DefWaitstates = 1;
915
916 auto IsShift16BitDefFn = [this, VALU](const MachineInstr &MI) {
917 if (!SIInstrInfo::isVALU(MI))
918 return false;
919 const SIInstrInfo *TII = ST.getInstrInfo();
920 if (SIInstrInfo::isSDWA(MI)) {
921 if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
922 if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
923 return false;
924 } else {
925 if (!AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::op_sel) ||
926 !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)
927 ->getImm() &
928 SISrcMods::DST_OP_SEL))
929 return false;
930 }
931 const SIRegisterInfo *TRI = ST.getRegisterInfo();
932 if (auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
933 Register Def = Dst->getReg();
934
935 for (const MachineOperand &Use : VALU->explicit_uses()) {
936 if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
937 return true;
938 }
939 }
940
941 return false;
942 };
943
944 int WaitStatesNeededForDef =
945 Shift16DefWaitstates -
946 getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
947 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
948 }
949
950 if (ST.hasVDecCoExecHazard()) {
951 const int VALUWriteSGPRVALUReadWaitstates = 2;
952 const int VALUWriteEXECRWLane = 4;
953 const int VALUWriteVGPRReadlaneRead = 1;
954
955 const SIRegisterInfo *TRI = ST.getRegisterInfo();
956 const MachineRegisterInfo &MRI = MF.getRegInfo();
957 Register UseReg;
958 auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
959 if (!SIInstrInfo::isVALU(MI))
960 return false;
961 return MI.modifiesRegister(UseReg, TRI);
962 };
963
964 for (const MachineOperand &Use : VALU->explicit_uses()) {
965 if (!Use.isReg())
966 continue;
967
968 UseReg = Use.getReg();
969 if (TRI->isSGPRReg(MRI, UseReg)) {
970 int WaitStatesNeededForDef =
971 VALUWriteSGPRVALUReadWaitstates -
972 getWaitStatesSince(IsVALUDefSGPRFn,
973 VALUWriteSGPRVALUReadWaitstates);
974 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
975 }
976 }
977
978 if (VALU->readsRegister(AMDGPU::VCC, TRI)) {
979 UseReg = AMDGPU::VCC;
980 int WaitStatesNeededForDef =
981 VALUWriteSGPRVALUReadWaitstates -
982 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
983 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
984 }
985
986 switch (VALU->getOpcode()) {
987 case AMDGPU::V_READLANE_B32:
988 case AMDGPU::V_READFIRSTLANE_B32: {
989 MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);
990 UseReg = Src->getReg();
991 int WaitStatesNeededForDef =
992 VALUWriteVGPRReadlaneRead -
993 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
994 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
995 }
996 [[fallthrough]];
997 case AMDGPU::V_WRITELANE_B32: {
998 UseReg = AMDGPU::EXEC;
999 int WaitStatesNeededForDef =
1000 VALUWriteEXECRWLane -
1001 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
1002 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1003 break;
1004 }
1005 default:
1006 break;
1007 }
1008 }
1009
1010 // This checks for the hazard where VMEM instructions that store more than
1011 // 8 bytes can have their store data overwritten by the next instruction.
1012 if (!ST.has12DWordStoreHazard())
1013 return WaitStatesNeeded;
1014
1015 const MachineRegisterInfo &MRI = MF.getRegInfo();
1016
1017 for (const MachineOperand &Def : VALU->defs()) {
1018 WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
1019 }
1020
1021 return WaitStatesNeeded;
1022 }
1023
1024 int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
1025 // This checks for hazards associated with inline asm statements.
1026 // Since inline asms can contain just about anything, we use this
1027 // to call/leverage other check*Hazard routines. Note that
1028 // this function doesn't attempt to address all possible inline asm
1029 // hazards (good luck), but is a collection of what has been
1030 // problematic thus far.
1031
1032 // see checkVALUHazards()
1033 if (!ST.has12DWordStoreHazard())
1034 return 0;
1035
1036 const MachineRegisterInfo &MRI = MF.getRegInfo();
1037 int WaitStatesNeeded = 0;
1038
1039 for (const MachineOperand &Op :
1040 llvm::drop_begin(IA->operands(), InlineAsm::MIOp_FirstOperand)) {
1041 if (Op.isReg() && Op.isDef()) {
1042 WaitStatesNeeded =
1043 std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
1044 }
1045 }
1046
1047 return WaitStatesNeeded;
1048 }
1049
1050 int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
1051 const SIInstrInfo *TII = ST.getInstrInfo();
1052 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1053 const MachineRegisterInfo &MRI = MF.getRegInfo();
1054
1055 const MachineOperand *LaneSelectOp =
1056 TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
1057
1058 if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
1059 return 0;
1060
1061 Register LaneSelectReg = LaneSelectOp->getReg();
1062 auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };
1063
1064 const int RWLaneWaitStates = 4;
1065 int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
1066 RWLaneWaitStates);
1067 return RWLaneWaitStates - WaitStatesSince;
1068 }
1069
1070 int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
1071 if (!ST.hasRFEHazards())
1072 return 0;
1073
1074 const SIInstrInfo *TII = ST.getInstrInfo();
1075
1076 const int RFEWaitStates = 1;
1077
1078 auto IsHazardFn = [TII](const MachineInstr &MI) {
1079 return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS;
1080 };
1081 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
1082 return RFEWaitStates - WaitStatesNeeded;
1083 }
1084
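// Instructions that read M0 implicitly (VINTRP, MOVREL, sendmsg, GDS, LDS
// direct, etc.) need a wait state after an SALU write of M0.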
1085 int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
1086 const SIInstrInfo *TII = ST.getInstrInfo();
1087 const int ReadM0WaitStates = 1;
1088 auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
1089 return ReadM0WaitStates -
1090 getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);
1091 }
1092
1093 void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
1094 fixVMEMtoScalarWriteHazards(MI);
1095 fixVcmpxPermlaneHazards(MI);
1096 fixSMEMtoVectorWriteHazards(MI);
1097 fixVcmpxExecWARHazard(MI);
1098 fixLdsBranchVmemWARHazard(MI);
1099 if (ST.hasLdsDirect()) {
1100 fixLdsDirectVALUHazard(MI);
1101 fixLdsDirectVMEMHazard(MI);
1102 }
1103 fixVALUPartialForwardingHazard(MI);
1104 fixVALUTransUseHazard(MI);
1105 fixWMMAHazards(MI);
1106 fixShift64HighRegBug(MI);
1107 fixVALUMaskWriteHazard(MI);
1108 fixRequiredExportPriority(MI);
1109 }
1110
1111 bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
1112 if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
1113 return false;
1114
1115 const SIInstrInfo *TII = ST.getInstrInfo();
1116 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1117 auto IsHazardFn = [TII, TRI](const MachineInstr &MI) {
1118 return (TII->isVOPC(MI) ||
1119 ((TII->isVOP3(MI) || TII->isSDWA(MI)) && MI.isCompare())) &&
1120 MI.modifiesRegister(AMDGPU::EXEC, TRI);
1121 };
1122
1123 auto IsExpiredFn = [](const MachineInstr &MI, int) {
1124 unsigned Opc = MI.getOpcode();
1125 return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
1126 Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
1127 };
1128
1129 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1130 std::numeric_limits<int>::max())
1131 return false;
1132
1133 // V_NOP will be discarded by SQ.
1134 // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
1135 // which is always a VGPR and available.
1136 auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
1137 Register Reg = Src0->getReg();
1138 bool IsUndef = Src0->isUndef();
1139 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1140 TII->get(AMDGPU::V_MOV_B32_e32))
1141 .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
1142 .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);
1143
1144 return true;
1145 }
1146
1147 bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
1148 if (!ST.hasVMEMtoScalarWriteHazard())
1149 return false;
1150 assert(!ST.hasExtendedWaitCounts());
1151
1152 if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
1153 return false;
1154
1155 if (MI->getNumDefs() == 0)
1156 return false;
1157
1158 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1159
1160 auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
1161 if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I) &&
1162 !SIInstrInfo::isFLAT(I))
1163 return false;
1164
1165 for (const MachineOperand &Def : MI->defs()) {
1166 const MachineOperand *Op =
1167 I.findRegisterUseOperand(Def.getReg(), TRI, false);
1168 if (!Op)
1169 continue;
1170 return true;
1171 }
1172 return false;
1173 };
1174
1175 auto IsExpiredFn = [](const MachineInstr &MI, int) {
1176 return SIInstrInfo::isVALU(MI) ||
1177 (MI.getOpcode() == AMDGPU::S_WAITCNT &&
1178 !MI.getOperand(0).getImm()) ||
1179 (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1180 AMDGPU::DepCtr::decodeFieldVmVsrc(MI.getOperand(0).getImm()) == 0);
1181 };
1182
1183 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1184 std::numeric_limits<int>::max())
1185 return false;
1186
1187 const SIInstrInfo *TII = ST.getInstrInfo();
1188 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1189 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1190 .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
1191 return true;
1192 }
1193
1194 bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
1195 if (!ST.hasSMEMtoVectorWriteHazard())
1196 return false;
1197 assert(!ST.hasExtendedWaitCounts());
1198
1199 if (!SIInstrInfo::isVALU(*MI))
1200 return false;
1201
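  // Find the scalar destination operand: V_READLANE/V_READFIRSTLANE name it
  // 'vdst', all other VALU instructions use 'sdst'.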
1202 unsigned SDSTName;
1203 switch (MI->getOpcode()) {
1204 case AMDGPU::V_READLANE_B32:
1205 case AMDGPU::V_READFIRSTLANE_B32:
1206 SDSTName = AMDGPU::OpName::vdst;
1207 break;
1208 default:
1209 SDSTName = AMDGPU::OpName::sdst;
1210 break;
1211 }
1212
1213 const SIInstrInfo *TII = ST.getInstrInfo();
1214 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1215 const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
1216 const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
1217 if (!SDST) {
1218 for (const auto &MO : MI->implicit_operands()) {
1219 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) {
1220 SDST = &MO;
1221 break;
1222 }
1223 }
1224 }
1225
1226 if (!SDST)
1227 return false;
1228
1229 const Register SDSTReg = SDST->getReg();
1230 auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
1231 return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
1232 };
1233
1234 auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
1235 if (TII->isSALU(MI)) {
1236 switch (MI.getOpcode()) {
1237 case AMDGPU::S_SETVSKIP:
1238 case AMDGPU::S_VERSION:
1239 case AMDGPU::S_WAITCNT_VSCNT:
1240 case AMDGPU::S_WAITCNT_VMCNT:
1241 case AMDGPU::S_WAITCNT_EXPCNT:
1242 // These instructions cannot mitigate the hazard.
1243 return false;
1244 case AMDGPU::S_WAITCNT_LGKMCNT:
1245 // Reducing lgkmcnt count to 0 always mitigates the hazard.
1246 return (MI.getOperand(1).getImm() == 0) &&
1247 (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1248 case AMDGPU::S_WAITCNT: {
1249 const int64_t Imm = MI.getOperand(0).getImm();
1250 AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
1251 // DsCnt corresponds to LGKMCnt here.
1252 return (Decoded.DsCnt == 0);
1253 }
1254 default:
1255 // SOPP instructions cannot mitigate the hazard.
1256 if (TII->isSOPP(MI))
1257 return false;
1258 // At this point the SALU can be assumed to mitigate the hazard
1259 // because either:
1260 // (a) it is independent of the at risk SMEM (breaking chain),
1261 // or
1262 // (b) it is dependent on the SMEM, in which case an appropriate
1263 // s_waitcnt lgkmcnt _must_ exist between it and the at risk
1264 // SMEM instruction.
1265 return true;
1266 }
1267 }
1268 return false;
1269 };
1270
1271 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1272 std::numeric_limits<int>::max())
1273 return false;
1274
1275 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1276 TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
1277 .addImm(0);
1278 return true;
1279 }
1280
1281 bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
1282 if (!ST.hasVcmpxExecWARHazard())
1283 return false;
1284 assert(!ST.hasExtendedWaitCounts());
1285
1286 if (!SIInstrInfo::isVALU(*MI))
1287 return false;
1288
1289 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1290 if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
1291 return false;
1292
1293 auto IsHazardFn = [TRI](const MachineInstr &I) {
1294 if (SIInstrInfo::isVALU(I))
1295 return false;
1296 return I.readsRegister(AMDGPU::EXEC, TRI);
1297 };
1298
1299 const SIInstrInfo *TII = ST.getInstrInfo();
1300 auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
1301 if (SIInstrInfo::isVALU(MI)) {
1302 if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
1303 return true;
1304 for (auto MO : MI.implicit_operands())
1305 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg())))
1306 return true;
1307 }
1308 if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1309 AMDGPU::DepCtr::decodeFieldSaSdst(MI.getOperand(0).getImm()) == 0)
1310 return true;
1311 return false;
1312 };
1313
1314 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1315 std::numeric_limits<int>::max())
1316 return false;
1317
1318 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1319 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1320 .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
1321 return true;
1322 }
1323
1324 static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
1325 const GCNSubtarget &ST) {
1326 if (!ST.hasLdsBranchVmemWARHazard())
1327 return false;
1328
1329 // Check if the necessary condition for the hazard is met: both LDS and VMEM
1330 // instructions need to appear in the same function.
1331 bool HasLds = false;
1332 bool HasVmem = false;
1333 for (auto &MBB : MF) {
1334 for (auto &MI : MBB) {
1335 HasLds |= SIInstrInfo::isDS(MI);
1336 HasVmem |=
1337 SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI);
1338 if (HasLds && HasVmem)
1339 return true;
1340 }
1341 }
1342 return false;
1343 }
1344
1345 static bool isStoreCountWaitZero(const MachineInstr &I) {
1346 return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1347 I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1348 !I.getOperand(1).getImm();
1349 }
1350
1351 bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
1352 if (!RunLdsBranchVmemWARHazardFixup)
1353 return false;
1354
1355 assert(ST.hasLdsBranchVmemWARHazard());
1356 assert(!ST.hasExtendedWaitCounts());
1357
1358 auto IsHazardInst = [](const MachineInstr &MI) {
1359 if (SIInstrInfo::isDS(MI))
1360 return 1;
1361 if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI))
1362 return 2;
1363 return 0;
1364 };
1365
1366 auto InstType = IsHazardInst(*MI);
1367 if (!InstType)
1368 return false;
1369
1370 auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
1371 return IsHazardInst(I) || isStoreCountWaitZero(I);
1372 };
1373
1374 auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
1375 if (!I.isBranch())
1376 return false;
1377
1378 auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
1379 auto InstType2 = IsHazardInst(I);
1380 return InstType2 && InstType != InstType2;
1381 };
1382
1383 auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
1384 auto InstType2 = IsHazardInst(I);
1385 if (InstType == InstType2)
1386 return true;
1387
1388 return isStoreCountWaitZero(I);
1389 };
1390
1391 return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) !=
1392 std::numeric_limits<int>::max();
1393 };
1394
1395 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1396 std::numeric_limits<int>::max())
1397 return false;
1398
1399 const SIInstrInfo *TII = ST.getInstrInfo();
1400 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1401 TII->get(AMDGPU::S_WAITCNT_VSCNT))
1402 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1403 .addImm(0);
1404
1405 return true;
1406 }
1407
1408 bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
1409 if (!SIInstrInfo::isLDSDIR(*MI))
1410 return false;
1411
1412 const int NoHazardWaitStates = 15;
1413 const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1414 const Register VDSTReg = VDST->getReg();
1415
1416 bool VisitedTrans = false;
1417 auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) {
1418 if (!SIInstrInfo::isVALU(I))
1419 return false;
1420 VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(I);
1421 // Cover both WAR and WAW
1422 return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1423 };
1424 auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) {
1425 if (WaitStates >= NoHazardWaitStates)
1426 return true;
1427 // Instructions which cause va_vdst==0 expire hazard
1428 return SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
1429 SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I);
1430 };
1431 auto GetWaitStatesFn = [](const MachineInstr &MI) {
1432 return SIInstrInfo::isVALU(MI) ? 1 : 0;
1433 };
1434
1435 DenseSet<const MachineBasicBlock *> Visited;
1436 auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
1437 std::next(MI->getReverseIterator()), 0,
1438 IsExpiredFn, Visited, GetWaitStatesFn);
1439
1440 // Transcendentals can execute in parallel to other VALUs.
1441 // This makes va_vdst count unusable with a mixture of VALU and TRANS.
1442 if (VisitedTrans)
1443 Count = 0;
1444
1445 MachineOperand *WaitVdstOp =
1446 TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst);
1447 WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates));
1448
1449 return true;
1450 }
1451
1452 bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
1453 if (!SIInstrInfo::isLDSDIR(*MI))
1454 return false;
1455
1456 const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1457 const Register VDSTReg = VDST->getReg();
1458
1459 auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) {
1460 if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I) &&
1461 !SIInstrInfo::isDS(I))
1462 return false;
1463 return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1464 };
1465 bool LdsdirCanWait = ST.hasLdsWaitVMSRC();
1466 // TODO: On GFX12 the hazard should expire on S_WAIT_LOADCNT/SAMPLECNT/BVHCNT
1467 // according to the type of VMEM instruction.
1468 auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) {
1469 return SIInstrInfo::isVALU(I) || SIInstrInfo::isEXP(I) ||
1470 (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) ||
1471 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1472 AMDGPU::DepCtr::decodeFieldVmVsrc(I.getOperand(0).getImm()) == 0) ||
1473 (LdsdirCanWait && SIInstrInfo::isLDSDIR(I) &&
1474 !TII.getNamedOperand(I, AMDGPU::OpName::waitvsrc)->getImm());
1475 };
1476
1477 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1478 std::numeric_limits<int>::max())
1479 return false;
1480
1481 if (LdsdirCanWait) {
1482 TII.getNamedOperand(*MI, AMDGPU::OpName::waitvsrc)->setImm(0);
1483 } else {
1484 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1485 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1486 .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
1487 }
1488
1489 return true;
1490 }
1491
1492 bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
1493 if (!ST.hasVALUPartialForwardingHazard())
1494 return false;
1495 assert(!ST.hasExtendedWaitCounts());
1496
1497 if (!ST.isWave64() || !SIInstrInfo::isVALU(*MI))
1498 return false;
1499
1500 SmallSetVector<Register, 4> SrcVGPRs;
1501
1502 for (const MachineOperand &Use : MI->explicit_uses()) {
1503 if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1504 SrcVGPRs.insert(Use.getReg());
1505 }
1506
1507 // Only applies with >= 2 unique VGPR sources
1508 if (SrcVGPRs.size() <= 1)
1509 return false;
1510
1511 // Look for the following pattern:
1512 // Va <- VALU [PreExecPos]
1513 // intv1
1514 // Exec <- SALU [ExecPos]
1515 // intv2
1516 // Vb <- VALU [PostExecPos]
1517 // intv3
1518 // MI Va, Vb (WaitState = 0)
1519 //
1520 // Where:
1521 // intv1 + intv2 <= 2 VALUs
1522 // intv3 <= 4 VALUs
1523 //
1524 // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
1525
1526 const int Intv1plus2MaxVALUs = 2;
1527 const int Intv3MaxVALUs = 4;
1528 const int IntvMaxVALUs = 6;
1529 const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;
1530
1531 struct StateType {
1532 SmallDenseMap<Register, int, 4> DefPos;
1533 int ExecPos = std::numeric_limits<int>::max();
1534 int VALUs = 0;
1535 };
1536
1537 StateType State;
1538
1539 // This overloads expiry testing with all the hazard detection
1540 auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1541 // Too many VALU states have passed
1542 if (State.VALUs > NoHazardVALUWaitStates)
1543 return HazardExpired;
1544
1545 // Instructions which cause va_vdst==0 expire hazard
1546 if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
1547 SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
1548 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1549 AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
1550 return HazardExpired;
1551
1552 // Track register writes
1553 bool Changed = false;
1554 if (SIInstrInfo::isVALU(I)) {
1555 for (Register Src : SrcVGPRs) {
1556 if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) {
1557 State.DefPos[Src] = State.VALUs;
1558 Changed = true;
1559 }
1560 }
1561 } else if (SIInstrInfo::isSALU(I)) {
1562 if (State.ExecPos == std::numeric_limits<int>::max()) {
1563 if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
1564 State.ExecPos = State.VALUs;
1565 Changed = true;
1566 }
1567 }
1568 }
1569
1570 // Early expiration: too many VALUs in intv3
1571 if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
1572 return HazardExpired;
1573
1574 // Only evaluate state if something changed
1575 if (!Changed)
1576 return NoHazardFound;
1577
1578 // Determine positions of VALUs pre/post exec change
1579 if (State.ExecPos == std::numeric_limits<int>::max())
1580 return NoHazardFound;
1581
1582 int PreExecPos = std::numeric_limits<int>::max();
1583 int PostExecPos = std::numeric_limits<int>::max();
1584
1585 for (auto Entry : State.DefPos) {
1586 int DefVALUs = Entry.second;
1587 if (DefVALUs != std::numeric_limits<int>::max()) {
1588 if (DefVALUs >= State.ExecPos)
1589 PreExecPos = std::min(PreExecPos, DefVALUs);
1590 else
1591 PostExecPos = std::min(PostExecPos, DefVALUs);
1592 }
1593 }
1594
1595 // Need a VALU def after the exec change
1596 if (PostExecPos == std::numeric_limits<int>::max())
1597 return NoHazardFound;
1598
1599 // Too many VALUs in intv3?
1600 int Intv3VALUs = PostExecPos;
1601 if (Intv3VALUs > Intv3MaxVALUs)
1602 return HazardExpired;
1603
1604 // Too many VALUs in intv2?
1605 int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
1606 if (Intv2VALUs > Intv1plus2MaxVALUs)
1607 return HazardExpired;
1608
1609 // Need a VALU def before the exec change
1610 if (PreExecPos == std::numeric_limits<int>::max())
1611 return NoHazardFound;
1612
1613 // Too many VALUs in intv1?
1614 int Intv1VALUs = PreExecPos - State.ExecPos;
1615 if (Intv1VALUs > Intv1plus2MaxVALUs)
1616 return HazardExpired;
1617
1618 // Too many VALUs in intv1 + intv2
1619 if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
1620 return HazardExpired;
1621
1622 return HazardFound;
1623 };
1624 auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1625 if (SIInstrInfo::isVALU(MI))
1626 State.VALUs += 1;
1627 };
1628
1629 DenseSet<const MachineBasicBlock *> Visited;
1630 if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1631 std::next(MI->getReverseIterator()), Visited))
1632 return false;
1633
1634 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1635 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1636 .addImm(0x0fff);
1637
1638 return true;
1639 }
1640
1641 bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
1642 if (!ST.hasVALUTransUseHazard())
1643 return false;
1644 assert(!ST.hasExtendedWaitCounts());
1645
1646 if (!SIInstrInfo::isVALU(*MI))
1647 return false;
1648
1649 SmallSet<Register, 4> SrcVGPRs;
1650
1651 for (const MachineOperand &Use : MI->explicit_uses()) {
1652 if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1653 SrcVGPRs.insert(Use.getReg());
1654 }
1655
1656 // Look for the following pattern:
1657 // Va <- TRANS VALU
1658 // intv
1659 // MI Va (WaitState = 0)
1660 //
1661 // Where:
1662 // intv <= 5 VALUs / 1 TRANS
1663 //
1664 // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
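// A hypothetical instance of this pattern (opcodes and registers are
// illustrative only):
//   v_exp_f32 v0, v1          ; Va <- TRANS VALU
//   v_add_f32 v2, v3, v4      ; fewer than 5 VALUs / 1 TRANS in between
//   v_mul_f32 v5, v0, v6      ; MI reads Va
// The fix emitted below waits for va_vdst==0 before MI.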
1665
1666 const int IntvMaxVALUs = 5;
1667 const int IntvMaxTRANS = 1;
1668
1669 struct StateType {
1670 int VALUs = 0;
1671 int TRANS = 0;
1672 };
1673
1674 StateType State;
1675
1676 // This callback combines expiry testing with all of the hazard detection.
1677 auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1678 // Too many VALU states have passed
1679 if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
1680 return HazardExpired;
1681
1682 // Instructions which cause va_vdst==0 expire the hazard.
1683 if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
1684 SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
1685 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1686 I.getOperand(0).getImm() == 0x0fff))
1687 return HazardExpired;
1688
1689 // Check for a TRANS instruction that writes one of the source VGPRs.
1690 if (SIInstrInfo::isTRANS(I)) {
1691 for (Register Src : SrcVGPRs) {
1692 if (I.modifiesRegister(Src, &TRI)) {
1693 return HazardFound;
1694 }
1695 }
1696 }
1697
1698 return NoHazardFound;
1699 };
1700 auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1701 if (SIInstrInfo::isVALU(MI))
1702 State.VALUs += 1;
1703 if (SIInstrInfo::isTRANS(MI))
1704 State.TRANS += 1;
1705 };
1706
1707 DenseSet<const MachineBasicBlock *> Visited;
1708 if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1709 std::next(MI->getReverseIterator()), Visited))
1710 return false;
1711
1712 // Hazard is observed - insert a wait on the va_vdst counter to ensure the
1713 // hazard is avoided.
1714 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1715 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1716 .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0));
1717
1718 return true;
1719 }
1720
1721 bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
1722 if (!SIInstrInfo::isWMMA(*MI) && !SIInstrInfo::isSWMMAC(*MI))
1723 return false;
1724
1725 const SIInstrInfo *TII = ST.getInstrInfo();
1726 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1727
1728 auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) {
1729 if (!SIInstrInfo::isWMMA(I) && !SIInstrInfo::isSWMMAC(I))
1730 return false;
1731
1732 // Src0 (matrix A) or Src1 (matrix B) of the current WMMA instruction overlaps
1733 // with the dest (matrix D) of the previous WMMA.
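// Purely as an illustration (the register ranges are hypothetical):
//   v_wmma_f32_16x16x16_f16 v[0:7], ...           ; previous WMMA writes D
//   v_wmma_f32_16x16x16_f16 v[8:15], v[0:7], ...  ; current WMMA reads D as A
// is flagged here, and a V_NOP is inserted between the two instructions.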
1734 const Register CurSrc0Reg =
1735 TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
1736 const Register CurSrc1Reg =
1737 TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
1738
1739 const Register PrevDstReg =
1740 TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
1741
1742 if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
1743 TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {
1744 return true;
1745 }
1746
1747 // GFX12+ allows overlap of matrix C with PrevDstReg (hardware will stall)
1748 // but Index can't overlap with PrevDstReg.
1749 if (AMDGPU::isGFX12Plus(ST)) {
1750 if (SIInstrInfo::isSWMMAC(*MI)) {
1751 const Register CurIndex =
1752 TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
1753 if (TRI->regsOverlap(PrevDstReg, CurIndex))
1754 return true;
1755 }
1756 return false;
1757 }
1758
1759 return false;
1760 };
1761
1762 auto IsExpiredFn = [](const MachineInstr &I, int) {
1763 return SIInstrInfo::isVALU(I);
1764 };
1765
1766 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1767 std::numeric_limits<int>::max())
1768 return false;
1769
1770 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
1771
1772 return true;
1773 }
1774
1775 bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
1776 if (!ST.hasShift64HighRegBug())
1777 return false;
1778 assert(!ST.hasExtendedWaitCounts());
1779
1780 switch (MI->getOpcode()) {
1781 default:
1782 return false;
1783 case AMDGPU::V_LSHLREV_B64_e64:
1784 case AMDGPU::V_LSHRREV_B64_e64:
1785 case AMDGPU::V_ASHRREV_I64_e64:
1786 break;
1787 }
1788
1789 MachineOperand *Amt = TII.getNamedOperand(*MI, AMDGPU::OpName::src0);
1790 if (!Amt->isReg())
1791 return false;
1792
1793 Register AmtReg = Amt->getReg();
1794 const MachineRegisterInfo &MRI = MF.getRegInfo();
1795 // Check if this is the last VGPR in the allocation block.
1796 if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
1797 return false;
1798
1799 if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1))
1800 return false;
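// Taken together, the two checks above mean the workaround only applies when
// the shift amount lives in v7, v15, ..., v255 (the last VGPR of an
// 8-register block) and either it is v255 or the following VGPR is unused.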
1801
1802 MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1);
1803 bool OverlappedSrc = Src1->isReg() && TRI.regsOverlap(Src1->getReg(), AmtReg);
1804 bool OverlappedDst = MI->modifiesRegister(AmtReg, &TRI);
1805 bool Overlapped = OverlappedSrc || OverlappedDst;
1806
1807 assert(!OverlappedDst || !OverlappedSrc ||
1808 Src1->getReg() == MI->getOperand(0).getReg());
1809 assert(ST.needsAlignedVGPRs());
1810 static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);
1811
1812 Register NewReg;
1813 for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
1814 : AMDGPU::VGPR_32RegClass) {
1815 if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) {
1816 NewReg = Reg;
1817 break;
1818 }
1819 }
1820
1821 Register NewAmt = Overlapped ? (Register)TRI.getSubReg(NewReg, AMDGPU::sub1)
1822 : NewReg;
1823 Register NewAmtLo;
1824
1825 if (Overlapped)
1826 NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);
1827
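// Sketch of the rewrite performed below for the non-overlapped case (register
// numbers are illustrative only):
//   v_lshlrev_b64 v[2:3], v7, v[4:5]
// becomes
//   s_waitcnt 0                        ; full wait, see below
//   v_swap_b32 v6, v7                  ; move the amount into a safe register
//   v_lshlrev_b64 v[2:3], v6, v[4:5]
//   v_swap_b32 v7, v6                  ; restore the original register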
1828 DebugLoc DL = MI->getDebugLoc();
1829 MachineBasicBlock *MBB = MI->getParent();
1830 // Insert a full wait count because the found register might have a pending wait.
1831 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_WAITCNT))
1832 .addImm(0);
1833
1834 // Insert V_SWAP_B32 instruction(s) and run hazard recognizer on them.
1835 if (Overlapped)
1836 runOnInstruction(
1837 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmtLo)
1838 .addDef(AmtReg - 1)
1839 .addReg(AmtReg - 1, RegState::Undef)
1840 .addReg(NewAmtLo, RegState::Undef));
1841 runOnInstruction(BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)
1842 .addDef(AmtReg)
1843 .addReg(AmtReg, RegState::Undef)
1844 .addReg(NewAmt, RegState::Undef));
1845
1846 // Instructions emitted after the current instruction will be processed by the
1847 // parent loop of the hazard recognizer in a natural way.
1848 BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
1849 AmtReg)
1850 .addDef(NewAmt)
1851 .addReg(NewAmt)
1852 .addReg(AmtReg);
1853 if (Overlapped)
1854 BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
1855 AmtReg - 1)
1856 .addDef(NewAmtLo)
1857 .addReg(NewAmtLo)
1858 .addReg(AmtReg - 1);
1859
1860 // Re-running the hazard recognizer on the modified instruction is not needed;
1861 // the inserted V_SWAP_B32 instructions have already both read and written the
1862 // new registers, so hazards related to these registers have been handled.
1863 Amt->setReg(NewAmt);
1864 Amt->setIsKill(false);
1865 // We do not update liveness, so the verifier may see it as undef.
1866 Amt->setIsUndef();
1867 if (OverlappedDst)
1868 MI->getOperand(0).setReg(NewReg);
1869 if (OverlappedSrc) {
1870 Src1->setReg(NewReg);
1871 Src1->setIsKill(false);
1872 Src1->setIsUndef();
1873 }
1874
1875 return true;
1876 }
1877
1878 int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
1879 int NSAtoVMEMWaitStates = 1;
1880
1881 if (!ST.hasNSAtoVMEMBug())
1882 return 0;
1883
1884 if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
1885 return 0;
1886
1887 const SIInstrInfo *TII = ST.getInstrInfo();
1888 const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
1889 if (!Offset || (Offset->getImm() & 6) == 0)
1890 return 0;
1891
1892 auto IsHazardFn = [TII](const MachineInstr &I) {
1893 if (!SIInstrInfo::isMIMG(I))
1894 return false;
1895 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode());
1896 return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
1897 TII->getInstSizeInBytes(I) >= 16;
1898 };
1899
1900 return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
1901 }
1902
1903 int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
1904 int FPAtomicToDenormModeWaitStates = 3;
1905
1906 if (!ST.hasFPAtomicToDenormModeHazard())
1907 return 0;
1908 assert(!ST.hasExtendedWaitCounts());
1909
1910 if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
1911 return 0;
1912
1913 auto IsHazardFn = [](const MachineInstr &I) {
1914 if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I))
1915 return false;
1916 return SIInstrInfo::isFPAtomic(I);
1917 };
1918
1919 auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
1920 if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
1921 return true;
1922
1923 switch (MI.getOpcode()) {
1924 case AMDGPU::S_WAITCNT:
1925 case AMDGPU::S_WAITCNT_VSCNT:
1926 case AMDGPU::S_WAITCNT_VMCNT:
1927 case AMDGPU::S_WAITCNT_EXPCNT:
1928 case AMDGPU::S_WAITCNT_LGKMCNT:
1929 case AMDGPU::S_WAIT_IDLE:
1930 return true;
1931 default:
1932 break;
1933 }
1934
1935 return false;
1936 };
1937
1938 return FPAtomicToDenormModeWaitStates -
1939 ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
1940 }
1941
1942 int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
1943 assert(SIInstrInfo::isMAI(*MI));
1944
1945 return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
1946 }
1947
1948 int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) {
1949 // Early exit if no padding is requested.
1950 if (MFMAPaddingRatio == 0)
1951 return 0;
1952
1953 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1954 if (!SIInstrInfo::isMFMA(*MI) || MFI->getOccupancy() < 2)
1955 return 0;
1956
1957 int NeighborMFMALatency = 0;
1958 auto IsNeighboringMFMA = [&NeighborMFMALatency,
1959 this](const MachineInstr &MI) {
1960 if (!SIInstrInfo::isMFMA(MI))
1961 return false;
1962
1963 NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);
1964 return true;
1965 };
1966
1967 const int MaxMFMAPipelineWaitStates = 16;
1968 int WaitStatesSinceNeighborMFMA =
1969 getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);
1970
1971 int NeighborMFMAPaddingNeeded =
1972 (NeighborMFMALatency * MFMAPaddingRatio / 100) -
1973 WaitStatesSinceNeighborMFMA;
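// For example (values are illustrative): with amdgpu-mfma-padding-ratio=50
// and a neighboring MFMA latency of 16, the target is 8 wait states; if 3
// wait states have already passed, 5 more states of padding are requested.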
1974
1975 return std::max(0, NeighborMFMAPaddingNeeded);
1976 }
1977
1978 int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) {
1979 int WaitStatesNeeded = 0;
1980 unsigned Opc = MI->getOpcode();
1981
1982 auto IsVALUFn = [](const MachineInstr &MI) {
1983 return SIInstrInfo::isVALU(MI) || MI.isInlineAsm();
1984 };
1985
1986 if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
1987 const int LegacyVALUWritesVGPRWaitStates = 2;
1988 const int VALUWritesExecWaitStates = 4;
1989 const int MaxWaitStates = 4;
1990
1991 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
1992 getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
1993 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1994
1995 if (WaitStatesNeeded < MaxWaitStates) {
1996 for (const MachineOperand &Use : MI->explicit_uses()) {
1997 const int MaxWaitStates = 2;
1998
1999 if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
2000 continue;
2001
2002 int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
2003 getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
2004 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2005
2006 if (WaitStatesNeeded == MaxWaitStates)
2007 break;
2008 }
2009 }
2010 }
2011
2012 for (const MachineOperand &Op : MI->explicit_operands()) {
2013 if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
2014 continue;
2015
2016 if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2017 continue;
2018
2019 const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
2020 const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
2021 const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
2022 const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
2023 const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
2024 const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
2025 const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
2026 const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
2027 const int MaxWaitStates = 18;
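// The HazardDefLatency computed below is the MFMA pass count; as the constant
// names above suggest, 2 passes correspond to the 4x4 variants, 8 passes to
// 16x16 and 16 passes to 32x32.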
2028 Register Reg = Op.getReg();
2029 unsigned HazardDefLatency = 0;
2030
2031 auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
2032 this](const MachineInstr &MI) {
2033 if (!SIInstrInfo::isMFMA(MI))
2034 return false;
2035 Register DstReg = MI.getOperand(0).getReg();
2036 if (DstReg == Reg)
2037 return false;
2038 HazardDefLatency =
2039 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
2040 return TRI.regsOverlap(DstReg, Reg);
2041 };
2042
2043 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
2044 MaxWaitStates);
2045 int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
2046 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2047 int OpNo = Op.getOperandNo();
2048 if (OpNo == SrcCIdx) {
2049 NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
2050 } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
2051 switch (HazardDefLatency) {
2052 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
2053 break;
2054 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
2055 break;
2056 case 16: [[fallthrough]];
2057 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
2058 break;
2059 }
2060 } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2061 switch (HazardDefLatency) {
2062 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
2063 break;
2064 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
2065 break;
2066 case 16: [[fallthrough]];
2067 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
2068 break;
2069 }
2070 }
2071
2072 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2073 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2074
2075 if (WaitStatesNeeded == MaxWaitStates)
2076 return WaitStatesNeeded; // Early exit.
2077
2078 auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
2079 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2080 return false;
2081 Register DstReg = MI.getOperand(0).getReg();
2082 return TRI.regsOverlap(Reg, DstReg);
2083 };
2084
2085 const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
2086 const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
2087 const int AccVGPRWriteAccVgprReadWaitStates = 3;
2088 NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
2089 if (OpNo == SrcCIdx)
2090 NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
2091 else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
2092 NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
2093
2094 WaitStatesNeededForUse = NeedWaitStates -
2095 getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
2096 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2097
2098 if (WaitStatesNeeded == MaxWaitStates)
2099 return WaitStatesNeeded; // Early exit.
2100 }
2101
2102 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2103 const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
2104 const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
2105 const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
2106 const int MaxWaitStates = 13;
2107 Register DstReg = MI->getOperand(0).getReg();
2108 unsigned HazardDefLatency = 0;
2109
2110 auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
2111 this](const MachineInstr &MI) {
2112 if (!SIInstrInfo::isMFMA(MI))
2113 return false;
2114 Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
2115 HazardDefLatency =
2116 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
2117 return TRI.regsOverlap(Reg, DstReg);
2118 };
2119
2120 int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
2121 int NeedWaitStates;
2122 switch (HazardDefLatency) {
2123 case 2: NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
2124 break;
2125 case 8: NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
2126 break;
2127 case 16: [[fallthrough]];
2128 default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
2129 break;
2130 }
2131
2132 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
2133 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2134 }
2135
2136 // Pad neighboring MFMA with noops for better inter-wave performance.
2137 WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
2138
2139 return WaitStatesNeeded;
2140 }
2141
2142 static int
2143 GFX940_XDL_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses) {
2144 // 2 pass -> 3
2145 // 4 pass -> 5
2146 // 8 pass -> 9
2147 // 16 pass -> 17
2148 return NumPasses + 1;
2149 }
2150
2151 static int
2152 GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses) {
2153 // 2 pass -> 2
2154 // 4 pass -> 4
2155 // 8 pass -> 8
2156 // 16 pass -> 16
2157 return NumPasses;
2158 }
2159
2160 static int
2161 GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
2162 // 2 pass -> 4
2163 // 4 pass -> 6
2164 // 8 pass -> 10
2165 // 16 pass -> 18
2166 return NumPasses + 2;
2167 }
2168
2169 static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
2170 // 2 pass -> 5
2171 // 4 pass -> 7
2172 // 8 pass -> 11
2173 // 16 pass -> 19
2174 return NumPasses + 3;
2175 }
2176
2177 int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
2178 int WaitStatesNeeded = 0;
2179 unsigned Opc = MI->getOpcode();
2180
2181 auto IsLegacyVALUFn = [](const MachineInstr &MI) {
2182 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI);
2183 };
2184
2185 auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) {
2186 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI) &&
2187 !SIInstrInfo::isDOT(MI);
2188 };
2189
2190 if (!SIInstrInfo::isMFMA(*MI))
2191 return WaitStatesNeeded;
2192
2193 const int VALUWritesExecWaitStates = 4;
2194 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2195 getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
2196 VALUWritesExecWaitStates);
2197 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2198
2199 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2200
2201 // Loop for both DGEMM and S/HGEMM 2nd instruction.
2202 for (const MachineOperand &Use : MI->explicit_uses()) {
2203 const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
2204 const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
2205 const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
2206 const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
2207 const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
2208 const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
2209 const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
2210 const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
2211 const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
2212 const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
2213 const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
2214 const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
2215 const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
2216 const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
2217 const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
2218 const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
2219 const int MaxWaitStates = 19;
2220
2221 if (!Use.isReg())
2222 continue;
2223 Register Reg = Use.getReg();
2224 bool FullReg;
2225 const MachineInstr *MI1;
2226
2227 auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
2228 this](const MachineInstr &MI) {
2229 if (!SIInstrInfo::isMFMA(MI))
2230 return false;
2231 Register DstReg = MI.getOperand(0).getReg();
2232 FullReg = (DstReg == Reg);
2233 MI1 = &MI;
2234 return TRI.regsOverlap(DstReg, Reg);
2235 };
2236
2237 WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
2238 getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
2239 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2240
2241 int NumWaitStates =
2242 getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates);
2243 if (NumWaitStates == std::numeric_limits<int>::max())
2244 continue;
2245
2246 int OpNo = Use.getOperandNo();
2247 unsigned Opc1 = MI1->getOpcode();
2248 int NeedWaitStates = 0;
2249 if (OpNo == SrcCIdx) {
2250 if (!isDGEMM(Opc) && (!ST.hasGFX940Insts() && isDGEMM(Opc1))) {
2251 NeedWaitStates = 0;
2252 } else if (FullReg) {
2253 if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2254 Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
2255 (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2256 Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
2257 NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
2258 else if (ST.hasGFX940Insts() &&
2259 TSchedModel.computeInstrLatency(MI1) == 2)
2260 NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
2261 } else {
2262 switch (Opc1) {
2263 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2264 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2265 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2266 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2267 if (!isXDL(ST, *MI))
2268 NeedWaitStates = DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
2269 break;
2270 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2271 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2272 if (!isXDL(ST, *MI))
2273 NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
2274 break;
2275 default:
2276 int NumPasses = TSchedModel.computeInstrLatency(MI1);
2277 if (ST.hasGFX940Insts()) {
2278 if (isXDL(ST, *MI) && !isXDL(ST, *MI1))
2279 break;
2280
2281 NeedWaitStates =
2282 isXDL(ST, *MI1)
2283 ? GFX940_XDL_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(
2284 NumPasses)
2285 : GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(
2286 NumPasses);
2287 break;
2288 }
2289
2290 switch (NumPasses) {
2291 case 2:
2292 NeedWaitStates =
2293 isDGEMM(Opc) ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
2294 : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
2295 break;
2296 case 8:
2297 NeedWaitStates =
2298 isDGEMM(Opc)
2299 ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
2300 : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
2301 break;
2302 case 16:
2303 NeedWaitStates =
2304 isDGEMM(Opc)
2305 ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
2306 : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
2307 break;
2308 default:
2309 llvm_unreachable("unexpected number of passes");
2310 }
2311 }
2312 }
2313 } else {
2314 switch (Opc1) {
2315 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2316 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2317 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2318 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2319 NeedWaitStates = DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
2320 break;
2321 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2322 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2323 NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
2324 break;
2325 default:
2326 int NumPasses = TSchedModel.computeInstrLatency(MI1);
2327
2328 if (ST.hasGFX940Insts()) {
2329 NeedWaitStates =
2330 isXDL(ST, *MI1)
2331 ? GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(
2332 NumPasses)
2333 : GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(
2334 NumPasses);
2335 break;
2336 }
2337
2338 switch (NumPasses) {
2339 case 2:
2340 NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
2341 break;
2342 case 4:
2343 llvm_unreachable("unexpected number of passes for mfma");
2344 case 8:
2345 NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
2346 break;
2347 case 16:
2348 default:
2349 NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
2350 }
2351 }
2352 }
2353 if (WaitStatesNeeded >= NeedWaitStates)
2354 continue;
2355
2356 WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
2357 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2358
2359 if (WaitStatesNeeded == MaxWaitStates)
2360 break;
2361 }
2362
2363 // Pad neighboring MFMA with noops for better inter-wave performance.
2364 WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
2365
2366 return WaitStatesNeeded;
2367 }
2368
2369 int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
2370 // On gfx90a+ relevant hazards are checked in checkMAIVALUHazards()
2371 if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
2372 return 0;
2373
2374 int WaitStatesNeeded = 0;
2375
2376 auto IsAccVgprReadFn = [](const MachineInstr &MI) {
2377 return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
2378 };
2379
2380 for (const MachineOperand &Op : MI->explicit_uses()) {
2381 if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
2382 continue;
2383
2384 Register Reg = Op.getReg();
2385
2386 const int AccVgprReadLdStWaitStates = 2;
2387 const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
2388 const int MaxWaitStates = 2;
2389
2390 int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
2391 getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
2392 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2393
2394 if (WaitStatesNeeded == MaxWaitStates)
2395 return WaitStatesNeeded; // Early exit.
2396
2397 auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
2398 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
2399 MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2400 return false;
2401 auto IsVALUFn = [](const MachineInstr &MI) {
2402 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMAI(MI);
2403 };
2404 return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
2405 std::numeric_limits<int>::max();
2406 };
2407
2408 WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
2409 getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
2410 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2411 }
2412
2413 return WaitStatesNeeded;
2414 }
2415
2416 static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
2417 // 2 pass -> 4
2418 // 4 pass -> 6
2419 // 8 pass -> 10
2420 // 16 pass -> 18
2421 return NumPasses + 2;
2422 }
2423
2424 static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
2425 // 2 pass -> 5
2426 // 4 pass -> 7
2427 // 8 pass -> 11
2428 // 16 pass -> 19
2429 return NumPasses + 3;
2430 }
2431
2432 static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
2433 // 2 pass -> 5
2434 // 4 pass -> 7
2435 // 8 pass -> 11
2436 // 16 pass -> 19
2437 return NumPasses + 3;
2438 }
2439
2440 static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
2441 // 2 pass -> 4
2442 // 4 pass -> 6
2443 // 8 pass -> 10
2444 // 16 pass -> 18
2445 return NumPasses + 2;
2446 }
2447
2448 int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
2449 if (!ST.hasGFX90AInsts())
2450 return 0;
2451
2452 auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
2453 return isDGEMM(MI.getOpcode());
2454 };
2455
2456 // This is checked in checkMAIHazards90A()
2457 if (SIInstrInfo::isMFMA(*MI))
2458 return 0;
2459
2460 const MachineRegisterInfo &MRI = MF.getRegInfo();
2461
2462 int WaitStatesNeeded = 0;
2463
2464 bool IsMem = SIInstrInfo::isVMEM(*MI) ||
2465 SIInstrInfo::isFLAT(*MI) ||
2466 SIInstrInfo::isDS(*MI);
2467 bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(*MI);
2468 bool IsVALU = SIInstrInfo::isVALU(*MI);
2469
2470 const MachineInstr *MFMA = nullptr;
2471 unsigned Reg;
2472 auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
2473 if (!SIInstrInfo::isMFMA(MI) ||
2474 !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2475 return false;
2476 MFMA = &MI;
2477 return true;
2478 };
2479
2480 const MachineInstr *DOT = nullptr;
2481 auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
2482 if (!SIInstrInfo::isDOT(MI) ||
2483 !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2484 return false;
2485 DOT = &MI;
2486 return true;
2487 };
2488
2489 bool DGEMMAfterVALUWrite = false;
2490 auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
2491 // Found DGEMM on reverse traversal to def.
2492 if (isDGEMM(MI.getOpcode()))
2493 DGEMMAfterVALUWrite = true;
2494
2495 // Only a hazard if the register is defined by a VALU and a DGEMM is found
2496 // after the def.
2497 if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite)
2498 return false;
2499
2500 return true;
2501 };
2502
2503 int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
2504 AMDGPU::OpName::src2);
2505
2506 if (IsMemOrExport || IsVALU) {
2507 const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
2508 const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
2509 const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
2510 const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
2511 const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
2512 const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
2513 const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
2514 const int DotWriteSameDotReadSrcAB = 3;
2515 const int DotWriteDifferentVALURead = 3;
2516 const int DMFMABetweenVALUWriteVMEMRead = 2;
2517 const int MaxWaitStates = 19;
2518
2519 for (const MachineOperand &Use : MI->explicit_uses()) {
2520 if (!Use.isReg())
2521 continue;
2522 Reg = Use.getReg();
2523
2524 DOT = nullptr;
2525 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
2526 MaxWaitStates);
2527 if (DOT) {
2528 int NeedWaitStates = 0;
2529 if (DOT->getOpcode() == MI->getOpcode()) {
2530 if (&Use - &MI->getOperand(0) != SrcCIdx)
2531 NeedWaitStates = DotWriteSameDotReadSrcAB;
2532 } else {
2533 NeedWaitStates = DotWriteDifferentVALURead;
2534 }
2535
2536 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2537 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2538 }
2539
2540 // Workaround for a HW data hazard bug observed only on GFX90A. When there is
2541 // a DGEMM instruction in-between a VALU and a VMEM instruction, the SQ
2542 // incorrectly omits the two wait states between the two instructions that
2543 // are needed to avoid the data hazard.
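// An illustrative shape of the problematic sequence (opcodes and registers
// are hypothetical):
//   v_mov_b32 v0, ...             ; VALU writes v0
//   v_mfma_f64_16x16x4f64 ...     ; DGEMM in between
//   flat_store_dword ..., v0      ; VMEM reads v0
// In that case DMFMABetweenVALUWriteVMEMRead wait states are forced here.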
2544 if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
2545 DGEMMAfterVALUWrite = false;
2546 if (TRI.isVectorRegister(MRI, Reg)) {
2547 int WaitStatesNeededForUse =
2548 DMFMABetweenVALUWriteVMEMRead -
2549 getWaitStatesSinceDef(Reg, IsDGEMMHazard,
2550 DMFMABetweenVALUWriteVMEMRead);
2551
2552 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2553 }
2554 }
2555
2556 MFMA = nullptr;
2557 WaitStatesSinceDef =
2558 getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
2559 if (!MFMA)
2560 continue;
2561
2562 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
2563 int NumPasses = HazardDefLatency;
2564 int NeedWaitStates = MaxWaitStates;
2565
2566 if (isDGEMM(MFMA->getOpcode())) {
2567 switch (HazardDefLatency) {
2568 case 4:
2569 NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
2570 : DMFMA4x4WriteVgprVALUReadWaitStates;
2571 break;
2572 case 8:
2573 case 16:
2574 NeedWaitStates = IsMemOrExport
2575 ? DMFMA16x16WriteVgprMemExpReadWaitStates
2576 : DMFMA16x16WriteVgprVALUReadWaitStates;
2577 break;
2578 default:
2579 llvm_unreachable("unexpected dgemm");
2580 }
2581 } else if (ST.hasGFX940Insts()) {
2582 NeedWaitStates =
2583 isXDL(ST, *MFMA)
2584 ? GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(NumPasses)
2585 : GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(
2586 NumPasses);
2587 } else {
2588 switch (HazardDefLatency) {
2589 case 2:
2590 NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
2591 break;
2592 case 8:
2593 NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
2594 break;
2595 case 16:
2596 NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
2597 break;
2598 default:
2599 llvm_unreachable("unexpected number of passes for mfma");
2600 }
2601 }
2602
2603 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2604 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2605
2606 if (WaitStatesNeeded == MaxWaitStates)
2607 break;
2608 }
2609 }
2610
2611 unsigned Opc = MI->getOpcode();
2612 const int DMFMAToFMA64WaitStates = 2;
2613 if ((Opc == AMDGPU::V_FMA_F64_e64 ||
2614 Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
2615 Opc == AMDGPU::V_FMAC_F64_dpp) &&
2616 WaitStatesNeeded < DMFMAToFMA64WaitStates) {
2617 int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
2618 getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
2619 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2620 }
2621
2622 if (!IsVALU && !IsMemOrExport)
2623 return WaitStatesNeeded;
2624
2625 for (const MachineOperand &Def : MI->defs()) {
2626 const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
2627 const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
2628 const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
2629 const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
2630 const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
2631 const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
2632 const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
2633 const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
2634 const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
2635 const int DotWriteDifferentVALUWrite = 3;
2636 const int MaxWaitStates = 19;
2637 const int MaxWarWaitStates = 15;
2638
2639 Reg = Def.getReg();
2640
2641 DOT = nullptr;
2642 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
2643 MaxWaitStates);
2644 if (DOT && DOT->getOpcode() != MI->getOpcode())
2645 WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
2646 WaitStatesSinceDef);
2647
2648 MFMA = nullptr;
2649 WaitStatesSinceDef =
2650 getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
2651 if (MFMA) {
2652 int NeedWaitStates = MaxWaitStates;
2653 int NumPasses = TSchedModel.computeInstrLatency(MFMA);
2654
2655 if (isDGEMM(MFMA->getOpcode())) {
2656 switch (NumPasses) {
2657 case 4:
2658 NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
2659 break;
2660 case 8:
2661 case 16:
2662 NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates;
2663 break;
2664 default:
2665 llvm_unreachable("unexpected number of cycles for dgemm");
2666 }
2667 } else if (ST.hasGFX940Insts()) {
2668 NeedWaitStates =
2669 isXDL(ST, *MFMA)
2670 ? GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(NumPasses)
2671 : GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(NumPasses);
2672 } else {
2673 switch (NumPasses) {
2674 case 2:
2675 NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
2676 break;
2677 case 8:
2678 NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
2679 break;
2680 case 16:
2681 NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates;
2682 break;
2683 default:
2684 llvm_unreachable("Unexpected number of passes for mfma");
2685 }
2686 }
2687
2688 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2689 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2690
2691 if (WaitStatesNeeded == MaxWaitStates)
2692 break;
2693 }
2694
2695 auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
2696 if (!SIInstrInfo::isMFMA(MI) || isDGEMM(MI.getOpcode()) ||
2697 !MI.readsRegister(Reg, &TRI))
2698 return false;
2699
2700 if (ST.hasGFX940Insts() && !isXDL(ST, MI))
2701 return false;
2702
2703 const MachineOperand *SrcC =
2704 TII.getNamedOperand(MI, AMDGPU::OpName::src2);
2705 assert(SrcC);
2706 if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
2707 return false;
2708
2709 MFMA = &MI;
2710 return true;
2711 };
2712
2713 MFMA = nullptr;
2714 int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
2715 MaxWarWaitStates);
2716 if (!MFMA)
2717 continue;
2718
2719 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
2720 int NeedWaitStates = MaxWaitStates;
2721 switch (HazardDefLatency) {
2722 case 2: NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
2723 break;
2724 case 4: assert(ST.hasGFX940Insts());
2725 NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
2726 break;
2727 case 8: NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
2728 break;
2729 case 16: [[fallthrough]];
2730 default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
2731 break;
2732 }
2733
2734 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
2735 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2736 }
2737
2738 return WaitStatesNeeded;
2739 }
2740
2741 bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
2742 if (!SU->isInstr())
2743 return false;
2744
2745 const MachineInstr *MAI = nullptr;
2746
2747 auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
2748 MAI = nullptr;
2749 if (SIInstrInfo::isMFMA(MI))
2750 MAI = &MI;
2751 return MAI != nullptr;
2752 };
2753
2754 MachineInstr *MI = SU->getInstr();
2755 if (IsMFMAFn(*MI)) {
2756 int W = getWaitStatesSince(IsMFMAFn, 16);
2757 if (MAI)
2758 return W < (int)TSchedModel.computeInstrLatency(MAI);
2759 }
2760
2761 return false;
2762 }
2763
2764 bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
2765 if (!ST.hasVALUMaskWriteHazard())
2766 return false;
2767 assert(!ST.hasExtendedWaitCounts());
2768
2769 if (!ST.isWave64() || !SIInstrInfo::isSALU(*MI))
2770 return false;
2771
2772 // The hazard sequence is three instructions:
2773 // 1. VALU reads SGPR as mask
2774 // 2. SALU writes SGPR
2775 // 3. SALU reads SGPR
2776 // The hazard can expire if the distance between 2 and 3 is sufficient.
2777 // In practice this happens <10% of the time, hence this always assumes
2778 // the hazard exists if 1 and 2 are present to avoid searching.
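// A hypothetical instance of the sequence above (registers illustrative):
//   v_cndmask_b32 v0, v1, v2, s[4:5]   ; 1. VALU reads s[4:5] as mask
//   s_mov_b64 s[4:5], exec             ; 2. SALU writes s[4:5] (this MI)
//   s_and_b64 s[6:7], s[4:5], s[8:9]   ; 3. SALU reads s[4:5]
// The fix appends s_waitcnt_depctr sa_sdst(0) immediately after 2.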
2779
2780 const MachineOperand *SDSTOp = TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
2781 if (!SDSTOp || !SDSTOp->isReg())
2782 return false;
2783
2784 const Register HazardReg = SDSTOp->getReg();
2785 if (HazardReg == AMDGPU::EXEC ||
2786 HazardReg == AMDGPU::EXEC_LO ||
2787 HazardReg == AMDGPU::EXEC_HI ||
2788 HazardReg == AMDGPU::M0)
2789 return false;
2790
2791 auto IsHazardFn = [HazardReg, this](const MachineInstr &I) {
2792 switch (I.getOpcode()) {
2793 case AMDGPU::V_ADDC_U32_e32:
2794 case AMDGPU::V_ADDC_U32_dpp:
2795 case AMDGPU::V_CNDMASK_B16_e32:
2796 case AMDGPU::V_CNDMASK_B16_dpp:
2797 case AMDGPU::V_CNDMASK_B32_e32:
2798 case AMDGPU::V_CNDMASK_B32_dpp:
2799 case AMDGPU::V_DIV_FMAS_F32_e64:
2800 case AMDGPU::V_DIV_FMAS_F64_e64:
2801 case AMDGPU::V_SUBB_U32_e32:
2802 case AMDGPU::V_SUBB_U32_dpp:
2803 case AMDGPU::V_SUBBREV_U32_e32:
2804 case AMDGPU::V_SUBBREV_U32_dpp:
2805 // These implicitly read VCC as mask source.
2806 return HazardReg == AMDGPU::VCC ||
2807 HazardReg == AMDGPU::VCC_LO ||
2808 HazardReg == AMDGPU::VCC_HI;
2809 case AMDGPU::V_ADDC_U32_e64:
2810 case AMDGPU::V_ADDC_U32_e64_dpp:
2811 case AMDGPU::V_CNDMASK_B16_e64:
2812 case AMDGPU::V_CNDMASK_B16_e64_dpp:
2813 case AMDGPU::V_CNDMASK_B32_e64:
2814 case AMDGPU::V_CNDMASK_B32_e64_dpp:
2815 case AMDGPU::V_SUBB_U32_e64:
2816 case AMDGPU::V_SUBB_U32_e64_dpp:
2817 case AMDGPU::V_SUBBREV_U32_e64:
2818 case AMDGPU::V_SUBBREV_U32_e64_dpp: {
2819 // Only check mask register overlaps.
2820 const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2);
2821 assert(SSRCOp);
2822 return TRI.regsOverlap(SSRCOp->getReg(), HazardReg);
2823 }
2824 default:
2825 return false;
2826 }
2827 };
2828
2829 const MachineRegisterInfo &MRI = MF.getRegInfo();
2830 auto IsExpiredFn = [&MRI, this](const MachineInstr &I, int) {
2831 // s_waitcnt_depctr sa_sdst(0) mitigates hazard.
2832 if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
2833 AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0)
2834 return true;
2835
2836 // VALU access to any SGPR or literal constant other than HazardReg
2837 // mitigates hazard. No need to check HazardReg here as this will
2838 // only be called when !IsHazardFn.
2839 if (!SIInstrInfo::isVALU(I))
2840 return false;
2841 for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) {
2842 const MachineOperand &Op = I.getOperand(OpNo);
2843 if (Op.isReg()) {
2844 Register OpReg = Op.getReg();
2845 // Only consider uses
2846 if (!Op.isUse())
2847 continue;
2848 // Ignore EXEC
2849 if (OpReg == AMDGPU::EXEC ||
2850 OpReg == AMDGPU::EXEC_LO ||
2851 OpReg == AMDGPU::EXEC_HI)
2852 continue;
2853 // Ignore all implicit uses except VCC
2854 if (Op.isImplicit()) {
2855 if (OpReg == AMDGPU::VCC ||
2856 OpReg == AMDGPU::VCC_LO ||
2857 OpReg == AMDGPU::VCC_HI)
2858 return true;
2859 continue;
2860 }
2861 if (TRI.isSGPRReg(MRI, OpReg))
2862 return true;
2863 } else {
2864 const MCInstrDesc &InstDesc = I.getDesc();
2865 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
2866 if (!TII.isInlineConstant(Op, OpInfo))
2867 return true;
2868 }
2869 }
2870 return false;
2871 };
2872
2873 // Check for hazard
2874 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
2875 std::numeric_limits<int>::max())
2876 return false;
2877
2878 auto NextMI = std::next(MI->getIterator());
2879
2880 // Add s_waitcnt_depctr sa_sdst(0) after SALU write.
2881 BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
2882 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
2883 .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
2884
2885 // SALU write may be s_getpc in a bundle.
2886 if (MI->getOpcode() == AMDGPU::S_GETPC_B64) {
2887 // Update offsets of any references in the bundle.
2888 while (NextMI != MI->getParent()->end() &&
2889 NextMI->isBundledWithPred()) {
2890 for (auto &Operand : NextMI->operands()) {
2891 if (Operand.isGlobal())
2892 Operand.setOffset(Operand.getOffset() + 4);
2893 }
2894 NextMI++;
2895 }
2896 }
2897
2898 return true;
2899 }
2900
2901 static bool ensureEntrySetPrio(MachineFunction *MF, int Priority,
2902 const SIInstrInfo &TII) {
2903 MachineBasicBlock &EntryMBB = MF->front();
2904 if (EntryMBB.begin() != EntryMBB.end()) {
2905 auto &EntryMI = *EntryMBB.begin();
2906 if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO &&
2907 EntryMI.getOperand(0).getImm() >= Priority)
2908 return false;
2909 }
2910
2911 BuildMI(EntryMBB, EntryMBB.begin(), DebugLoc(), TII.get(AMDGPU::S_SETPRIO))
2912 .addImm(Priority);
2913 return true;
2914 }
2915
2916 bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
2917 if (!ST.hasRequiredExportPriority())
2918 return false;
2919
2920 // Assume the following shader types will never have exports,
2921 // and avoid adding or adjusting S_SETPRIO.
2922 MachineBasicBlock *MBB = MI->getParent();
2923 MachineFunction *MF = MBB->getParent();
2924 auto CC = MF->getFunction().getCallingConv();
2925 switch (CC) {
2926 case CallingConv::AMDGPU_CS:
2927 case CallingConv::AMDGPU_CS_Chain:
2928 case CallingConv::AMDGPU_CS_ChainPreserve:
2929 case CallingConv::AMDGPU_KERNEL:
2930 return false;
2931 default:
2932 break;
2933 }
2934
2935 const int MaxPriority = 3;
2936 const int NormalPriority = 2;
2937 const int PostExportPriority = 0;
2938
2939 auto It = MI->getIterator();
2940 switch (MI->getOpcode()) {
2941 case AMDGPU::S_ENDPGM:
2942 case AMDGPU::S_ENDPGM_SAVED:
2943 case AMDGPU::S_ENDPGM_ORDERED_PS_DONE:
2944 case AMDGPU::SI_RETURN_TO_EPILOG:
2945 // Ensure shader with calls raises priority at entry.
2946 // This ensures correct priority if exports exist in callee.
2947 if (MF->getFrameInfo().hasCalls())
2948 return ensureEntrySetPrio(MF, NormalPriority, TII);
2949 return false;
2950 case AMDGPU::S_SETPRIO: {
2951 // Raise minimum priority unless in workaround.
2952 auto &PrioOp = MI->getOperand(0);
2953 int Prio = PrioOp.getImm();
2954 bool InWA = (Prio == PostExportPriority) &&
2955 (It != MBB->begin() && TII.isEXP(*std::prev(It)));
2956 if (InWA || Prio >= NormalPriority)
2957 return false;
2958 PrioOp.setImm(std::min(Prio + NormalPriority, MaxPriority));
2959 return true;
2960 }
2961 default:
2962 if (!TII.isEXP(*MI))
2963 return false;
2964 break;
2965 }
2966
2967 // Check entry priority at each export (as there will only be a few).
2968 // Note: amdgpu_gfx can only be a callee, so defer to caller setprio.
2969 bool Changed = false;
2970 if (CC != CallingConv::AMDGPU_Gfx)
2971 Changed = ensureEntrySetPrio(MF, NormalPriority, TII);
2972
2973 auto NextMI = std::next(It);
2974 bool EndOfShader = false;
2975 if (NextMI != MBB->end()) {
2976 // Only need WA at end of sequence of exports.
2977 if (TII.isEXP(*NextMI))
2978 return Changed;
2979 // Assume appropriate S_SETPRIO after export means WA already applied.
2980 if (NextMI->getOpcode() == AMDGPU::S_SETPRIO &&
2981 NextMI->getOperand(0).getImm() == PostExportPriority)
2982 return Changed;
2983 EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM;
2984 }
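// In sketch form, the workaround emitted below after the final export is:
//   s_setprio 0               ; drop priority
//   s_waitcnt_expcnt null, 0  ; wait for exports (omitted at end of shader)
//   s_nop 0
//   s_nop 0
//   s_setprio 2               ; restore priority (omitted at end of shader)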
2985
2986 const DebugLoc &DL = MI->getDebugLoc();
2987
2988 // Lower priority.
2989 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
2990 .addImm(PostExportPriority);
2991
2992 if (!EndOfShader) {
2993 // Wait for exports to complete.
2994 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_WAITCNT_EXPCNT))
2995 .addReg(AMDGPU::SGPR_NULL)
2996 .addImm(0);
2997 }
2998
2999 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
3000 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
3001
3002 if (!EndOfShader) {
3003 // Return to normal (higher) priority.
3004 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
3005 .addImm(NormalPriority);
3006 }
3007
3008 return true;
3009 }
3010