xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp (revision 1342eb5a832fa10e689a29faab3acb6054e4778c)
1 //===- SIInsertHardClauses.cpp - Insert Hard Clauses ----------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Insert s_clause instructions to form hard clauses.
11 ///
12 /// Clausing load instructions can give cache coherency benefits. Before gfx10,
13 /// the hardware automatically detected "soft clauses", which were sequences of
14 /// memory instructions of the same type. In gfx10 this detection was removed,
15 /// and the s_clause instruction was introduced to explicitly mark "hard
16 /// clauses".
17 ///
18 /// It's the scheduler's job to form the clauses by putting similar memory
19 /// instructions next to each other. Our job is just to insert an s_clause
20 /// instruction to mark the start of each clause.
21 ///
22 /// Note that hard clauses are very similar to, but logically distinct from, the
23 /// groups of instructions that have to be restartable when XNACK is enabled.
24 /// The rules are slightly different in each case. For example an s_nop
25 /// instruction breaks a restartable group, but can appear in the middle of a
26 /// hard clause. (Before gfx10 there wasn't a distinction, and both were called
27 /// "soft clauses" or just "clauses".)
28 ///
29 /// The SIFormMemoryClauses pass and GCNHazardRecognizer deal with restartable
30 /// groups, not hard clauses.
31 //
32 //===----------------------------------------------------------------------===//
33 
34 #include "AMDGPU.h"
35 #include "GCNSubtarget.h"
36 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
37 #include "llvm/ADT/SmallVector.h"
38 #include "llvm/CodeGen/MachineFunctionPass.h"
39 #include "llvm/CodeGen/MachinePassManager.h"
40 
41 using namespace llvm;
42 
43 #define DEBUG_TYPE "si-insert-hard-clauses"
44 
45 static cl::opt<unsigned>
46     HardClauseLengthLimit("amdgpu-hard-clause-length-limit",
47                           cl::desc("Maximum number of memory instructions to "
48                                    "place in the same hard clause"),
49                           cl::Hidden);
50 
51 namespace {
52 
// Classification of an instruction with respect to hard clauses. The
// enumerators are ordered so that every "real" clause type compares less than
// or equal to LAST_REAL_HARDCLAUSE_TYPE; the bookkeeping kinds follow it.
enum HardClauseType {
  // ---- GFX10 only ----

  // Texture, buffer, global or scratch memory access.
  HARDCLAUSE_VMEM,
  // Flat memory access that is neither global nor scratch.
  HARDCLAUSE_FLAT,

  // ---- GFX11 only ----

  // Texture memory access, split by kind of operation.
  HARDCLAUSE_MIMG_LOAD,
  HARDCLAUSE_MIMG_STORE,
  HARDCLAUSE_MIMG_ATOMIC,
  HARDCLAUSE_MIMG_SAMPLE,
  // Buffer, global or scratch memory access, split by kind of operation.
  HARDCLAUSE_VMEM_LOAD,
  HARDCLAUSE_VMEM_STORE,
  HARDCLAUSE_VMEM_ATOMIC,
  // Flat (neither global nor scratch) memory access, split by kind of
  // operation.
  HARDCLAUSE_FLAT_LOAD,
  HARDCLAUSE_FLAT_STORE,
  HARDCLAUSE_FLAT_ATOMIC,
  // BVH instructions.
  HARDCLAUSE_BVH,

  // ---- Shared by all generations ----

  // LDS access.
  HARDCLAUSE_LDS,
  // Scalar memory access.
  HARDCLAUSE_SMEM,
  // VALU instructions.
  HARDCLAUSE_VALU,
  // Marks the end of the range of real clause types.
  LAST_REAL_HARDCLAUSE_TYPE = HARDCLAUSE_VALU,

  // Internal instructions. Apart from s_waitcnt these may sit in the middle
  // of a hard clause.
  HARDCLAUSE_INTERNAL,
  // Meta instructions such as KILL, which produce no ISA at all.
  HARDCLAUSE_IGNORE,
  // Everything that must not appear inside a hard clause: SALU, export,
  // branch, message, GDS, s_waitcnt and any instruction not covered above.
  HARDCLAUSE_ILLEGAL,
};
98 
99 class SIInsertHardClauses {
100 public:
101   const GCNSubtarget *ST = nullptr;
102 
103   HardClauseType getHardClauseType(const MachineInstr &MI) {
104     if (MI.mayLoad() || (MI.mayStore() && ST->shouldClusterStores())) {
105       if (ST->getGeneration() == AMDGPUSubtarget::GFX10) {
106         if ((SIInstrInfo::isVMEM(MI) && !SIInstrInfo::isFLAT(MI)) ||
107             SIInstrInfo::isSegmentSpecificFLAT(MI)) {
108           if (ST->hasNSAClauseBug()) {
109             const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
110             if (Info && Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA)
111               return HARDCLAUSE_ILLEGAL;
112           }
113           return HARDCLAUSE_VMEM;
114         }
115         if (SIInstrInfo::isFLAT(MI))
116           return HARDCLAUSE_FLAT;
117       } else {
118         assert(ST->getGeneration() >= AMDGPUSubtarget::GFX11);
119         if (SIInstrInfo::isMIMG(MI)) {
120           const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
121           const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
122               AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
123           if (BaseInfo->BVH)
124             return HARDCLAUSE_BVH;
125           if (BaseInfo->Sampler || BaseInfo->MSAA)
126             return HARDCLAUSE_MIMG_SAMPLE;
127           return MI.mayLoad() ? MI.mayStore() ? HARDCLAUSE_MIMG_ATOMIC
128                                               : HARDCLAUSE_MIMG_LOAD
129                               : HARDCLAUSE_MIMG_STORE;
130         }
131         if ((SIInstrInfo::isVMEM(MI) && !SIInstrInfo::isFLAT(MI)) ||
132             SIInstrInfo::isSegmentSpecificFLAT(MI)) {
133           return MI.mayLoad() ? MI.mayStore() ? HARDCLAUSE_VMEM_ATOMIC
134                                               : HARDCLAUSE_VMEM_LOAD
135                               : HARDCLAUSE_VMEM_STORE;
136         }
137         if (SIInstrInfo::isFLAT(MI)) {
138           return MI.mayLoad() ? MI.mayStore() ? HARDCLAUSE_FLAT_ATOMIC
139                                               : HARDCLAUSE_FLAT_LOAD
140                               : HARDCLAUSE_FLAT_STORE;
141         }
142       }
143       // TODO: LDS
144       if (SIInstrInfo::isSMRD(MI))
145         return HARDCLAUSE_SMEM;
146     }
147 
148     // Don't form VALU clauses. It's not clear what benefit they give, if any.
149 
150     // In practice s_nop is the only internal instruction we're likely to see.
151     // It's safe to treat the rest as illegal.
152     if (MI.getOpcode() == AMDGPU::S_NOP)
153       return HARDCLAUSE_INTERNAL;
154     if (MI.isMetaInstruction())
155       return HARDCLAUSE_IGNORE;
156     return HARDCLAUSE_ILLEGAL;
157   }
158 
159   // Track information about a clause as we discover it.
160   struct ClauseInfo {
161     // The type of all (non-internal) instructions in the clause.
162     HardClauseType Type = HARDCLAUSE_ILLEGAL;
163     // The first (necessarily non-internal) instruction in the clause.
164     MachineInstr *First = nullptr;
165     // The last non-internal instruction in the clause.
166     MachineInstr *Last = nullptr;
167     // The length of the clause including any internal instructions in the
168     // middle (but not at the end) of the clause.
169     unsigned Length = 0;
170     // Internal instructions at the and of a clause should not be included in
171     // the clause. Count them in TrailingInternalLength until a new memory
172     // instruction is added.
173     unsigned TrailingInternalLength = 0;
174     // The base operands of *Last.
175     SmallVector<const MachineOperand *, 4> BaseOps;
176   };
177 
178   bool emitClause(const ClauseInfo &CI, const SIInstrInfo *SII) {
179     if (CI.First == CI.Last)
180       return false;
181     assert(CI.Length <= ST->maxHardClauseLength() &&
182            "Hard clause is too long!");
183 
184     auto &MBB = *CI.First->getParent();
185     auto ClauseMI =
186         BuildMI(MBB, *CI.First, DebugLoc(), SII->get(AMDGPU::S_CLAUSE))
187             .addImm(CI.Length - 1);
188     finalizeBundle(MBB, ClauseMI->getIterator(),
189                    std::next(CI.Last->getIterator()));
190     return true;
191   }
192 
193   bool run(MachineFunction &MF) {
194     ST = &MF.getSubtarget<GCNSubtarget>();
195     if (!ST->hasHardClauses())
196       return false;
197 
198     unsigned MaxClauseLength = MF.getFunction().getFnAttributeAsParsedInteger(
199         "amdgpu-hard-clause-length-limit", 255);
200     if (HardClauseLengthLimit.getNumOccurrences())
201       MaxClauseLength = HardClauseLengthLimit;
202     MaxClauseLength = std::min(MaxClauseLength, ST->maxHardClauseLength());
203     if (MaxClauseLength <= 1)
204       return false;
205 
206     const SIInstrInfo *SII = ST->getInstrInfo();
207     const TargetRegisterInfo *TRI = ST->getRegisterInfo();
208 
209     bool Changed = false;
210     for (auto &MBB : MF) {
211       ClauseInfo CI;
212       for (auto &MI : MBB) {
213         HardClauseType Type = getHardClauseType(MI);
214 
215         int64_t Dummy1;
216         bool Dummy2;
217         LocationSize Dummy3 = LocationSize::precise(0);
218         SmallVector<const MachineOperand *, 4> BaseOps;
219         if (Type <= LAST_REAL_HARDCLAUSE_TYPE) {
220           if (!SII->getMemOperandsWithOffsetWidth(MI, BaseOps, Dummy1, Dummy2,
221                                                   Dummy3, TRI)) {
222             // We failed to get the base operands, so we'll never clause this
223             // instruction with any other, so pretend it's illegal.
224             Type = HARDCLAUSE_ILLEGAL;
225           }
226         }
227 
228         if (CI.Length == MaxClauseLength ||
229             (CI.Length && Type != HARDCLAUSE_INTERNAL &&
230              Type != HARDCLAUSE_IGNORE &&
231              (Type != CI.Type ||
232               // Note that we lie to shouldClusterMemOps about the size of the
233               // cluster. When shouldClusterMemOps is called from the machine
234               // scheduler it limits the size of the cluster to avoid increasing
235               // register pressure too much, but this pass runs after register
236               // allocation so there is no need for that kind of limit.
237               // We also lie about the Offset and OffsetIsScalable parameters,
238               // as they aren't used in the SIInstrInfo implementation.
239               !SII->shouldClusterMemOps(CI.BaseOps, 0, false, BaseOps, 0, false,
240                                         2, 2)))) {
241           // Finish the current clause.
242           Changed |= emitClause(CI, SII);
243           CI = ClauseInfo();
244         }
245 
246         if (CI.Length) {
247           // Extend the current clause.
248           if (Type != HARDCLAUSE_IGNORE) {
249             if (Type == HARDCLAUSE_INTERNAL) {
250               ++CI.TrailingInternalLength;
251             } else {
252               ++CI.Length;
253               CI.Length += CI.TrailingInternalLength;
254               CI.TrailingInternalLength = 0;
255               CI.Last = &MI;
256               CI.BaseOps = std::move(BaseOps);
257             }
258           }
259         } else if (Type <= LAST_REAL_HARDCLAUSE_TYPE) {
260           // Start a new clause.
261           CI = ClauseInfo{Type, &MI, &MI, 1, 0, std::move(BaseOps)};
262         }
263       }
264 
265       // Finish the last clause in the basic block if any.
266       if (CI.Length)
267         Changed |= emitClause(CI, SII);
268     }
269 
270     return Changed;
271   }
272 };
273 
274 class SIInsertHardClausesLegacy : public MachineFunctionPass {
275 public:
276   static char ID;
277   SIInsertHardClausesLegacy() : MachineFunctionPass(ID) {}
278 
279   bool runOnMachineFunction(MachineFunction &MF) override {
280     if (skipFunction(MF.getFunction()))
281       return false;
282 
283     return SIInsertHardClauses().run(MF);
284   }
285 
286   void getAnalysisUsage(AnalysisUsage &AU) const override {
287     AU.setPreservesCFG();
288     MachineFunctionPass::getAnalysisUsage(AU);
289   }
290 };
291 
292 } // namespace
293 
294 PreservedAnalyses
295 llvm::SIInsertHardClausesPass::run(MachineFunction &MF,
296                                    MachineFunctionAnalysisManager &MFAM) {
297   if (!SIInsertHardClauses().run(MF))
298     return PreservedAnalyses::all();
299 
300   auto PA = getMachineFunctionPassPreservedAnalyses();
301   PA.preserveSet<CFGAnalyses>();
302   return PA;
303 }
304 
305 char SIInsertHardClausesLegacy::ID = 0;
306 
307 char &llvm::SIInsertHardClausesID = SIInsertHardClausesLegacy::ID;
308 
309 INITIALIZE_PASS(SIInsertHardClausesLegacy, DEBUG_TYPE, "SI Insert Hard Clauses",
310                 false, false)
311