//===- AMDGPUUnifyDivergentExitNodes.cpp ----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is a variant of the UnifyFunctionExitNodes pass. Rather than ensuring
// there is at most one ret and one unreachable instruction, it ensures there is
// at most one divergent exiting block.
//
// StructurizeCFG can't deal with multi-exit regions formed by branches to
// multiple return nodes. It is not desirable to structurize regions with
// uniform branches, so unifying those into the same return block as divergent
// branches would inhibit the use of scalar branching. It still can't deal with
// the case where one branch goes to a return and another to an unreachable
// block; in that case the unreachable is replaced with a return.
//
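// As a minimal illustrative sketch (the input block and value names below are
// invented for this example; "UnifiedReturnBlock" and "UnifiedRetVal" are the
// names this pass actually creates), a function with two divergently reached
// returns:
//
//   entry:
//     br i1 %divergent.cond, label %ret0, label %ret1
//   ret0:
//     ret float %a
//   ret1:
//     ret float %b
//
// is rewritten so that both exits branch to a single return block carrying a
// phi of the returned values (simplifycfg may later fold the extra branches):
//
//   ret0:
//     br label %UnifiedReturnBlock
//   ret1:
//     br label %UnifiedReturnBlock
//   UnifiedReturnBlock:
//     %UnifiedRetVal = phi float [ %a, %ret0 ], [ %b, %ret1 ]
//     ret float %UnifiedRetVal
//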
//===----------------------------------------------------------------------===//

22 #include "AMDGPU.h"
23 #include "SIDefines.h"
24 #include "llvm/ADT/ArrayRef.h"
25 #include "llvm/ADT/SmallPtrSet.h"
26 #include "llvm/ADT/SmallVector.h"
27 #include "llvm/ADT/StringRef.h"
28 #include "llvm/Analysis/DomTreeUpdater.h"
29 #include "llvm/Analysis/LegacyDivergenceAnalysis.h"
30 #include "llvm/Analysis/PostDominators.h"
31 #include "llvm/Analysis/TargetTransformInfo.h"
32 #include "llvm/IR/BasicBlock.h"
33 #include "llvm/IR/CFG.h"
34 #include "llvm/IR/Constants.h"
35 #include "llvm/IR/Dominators.h"
36 #include "llvm/IR/Function.h"
37 #include "llvm/IR/IRBuilder.h"
38 #include "llvm/IR/InstrTypes.h"
39 #include "llvm/IR/Instructions.h"
40 #include "llvm/IR/Intrinsics.h"
41 #include "llvm/IR/IntrinsicsAMDGPU.h"
42 #include "llvm/IR/Type.h"
43 #include "llvm/InitializePasses.h"
44 #include "llvm/Pass.h"
45 #include "llvm/Support/Casting.h"
46 #include "llvm/Transforms/Scalar.h"
47 #include "llvm/Transforms/Utils.h"
48 #include "llvm/Transforms/Utils/Local.h"
49 
50 using namespace llvm;
51 
52 #define DEBUG_TYPE "amdgpu-unify-divergent-exit-nodes"

namespace {

class AMDGPUUnifyDivergentExitNodes : public FunctionPass {
public:
  static char ID; // Pass identification, replacement for typeid

  AMDGPUUnifyDivergentExitNodes() : FunctionPass(ID) {
    initializeAMDGPUUnifyDivergentExitNodesPass(*PassRegistry::getPassRegistry());
  }

  // We can preserve non-critical-edgeness when we unify function exit nodes
  void getAnalysisUsage(AnalysisUsage &AU) const override;
  bool runOnFunction(Function &F) override;
};

} // end anonymous namespace

char AMDGPUUnifyDivergentExitNodes::ID = 0;

char &llvm::AMDGPUUnifyDivergentExitNodesID = AMDGPUUnifyDivergentExitNodes::ID;

INITIALIZE_PASS_BEGIN(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE,
                      "Unify divergent function exit nodes", false, false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE,
                    "Unify divergent function exit nodes", false, false)

void AMDGPUUnifyDivergentExitNodes::getAnalysisUsage(AnalysisUsage &AU) const {
  if (RequireAndPreserveDomTree)
    AU.addRequired<DominatorTreeWrapperPass>();

  AU.addRequired<PostDominatorTreeWrapperPass>();

  AU.addRequired<LegacyDivergenceAnalysis>();

  if (RequireAndPreserveDomTree) {
    AU.addPreserved<DominatorTreeWrapperPass>();
    // FIXME: preserve PostDominatorTreeWrapperPass
  }

  // No divergent values are changed, only blocks and branch edges.
  AU.addPreserved<LegacyDivergenceAnalysis>();

  // We preserve the non-critical-edgeness property
  AU.addPreservedID(BreakCriticalEdgesID);

  // This is a cluster of orthogonal Transforms
  AU.addPreservedID(LowerSwitchID);
  FunctionPass::getAnalysisUsage(AU);

  AU.addRequired<TargetTransformInfoWrapperPass>();
}

/// \returns true if \p BB is reachable through only uniform branches.
/// XXX - Is there a more efficient way to find this?
static bool isUniformlyReached(const LegacyDivergenceAnalysis &DA,
                               BasicBlock &BB) {
  SmallVector<BasicBlock *, 8> Stack;
  SmallPtrSet<BasicBlock *, 8> Visited;

  for (BasicBlock *Pred : predecessors(&BB))
    Stack.push_back(Pred);

  while (!Stack.empty()) {
    BasicBlock *Top = Stack.pop_back_val();
    if (!DA.isUniform(Top->getTerminator()))
      return false;

    for (BasicBlock *Pred : predecessors(Top)) {
      if (Visited.insert(Pred).second)
        Stack.push_back(Pred);
    }
  }

  return true;
}

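/// Clear the "done" bit on every llvm.amdgcn.exp / llvm.amdgcn.exp.compr call
/// in \p F so that a null export inserted later can be the only export marked
/// "done".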
static void removeDoneExport(Function &F) {
  ConstantInt *BoolFalse = ConstantInt::getFalse(F.getContext());
  for (BasicBlock &BB : F) {
    for (Instruction &I : BB) {
      if (IntrinsicInst *Intrin = llvm::dyn_cast<IntrinsicInst>(&I)) {
        if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp) {
          Intrin->setArgOperand(6, BoolFalse); // done
        } else if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp_compr) {
          Intrin->setArgOperand(4, BoolFalse); // done
        }
      }
    }
  }
}

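/// Rewrite every block in \p ReturningBlocks to branch to a single new return
/// block named \p Name, inserting a null export first if \p InsertExport is
/// set, and return the newly created block.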
static BasicBlock *unifyReturnBlockSet(Function &F, DomTreeUpdater &DTU,
                                       ArrayRef<BasicBlock *> ReturningBlocks,
                                       bool InsertExport,
                                       const TargetTransformInfo &TTI,
                                       StringRef Name) {
  // We need to insert a new basic block into the function, add a PHI node (if
  // the function returns values), and convert all of the return instructions
  // into unconditional branches.
  BasicBlock *NewRetBlock = BasicBlock::Create(F.getContext(), Name, &F);
  IRBuilder<> B(NewRetBlock);

  if (InsertExport) {
    // Ensure that there's only one "done" export in the shader by removing the
    // "done" bit set on the original final export. More than one "done" export
    // can lead to undefined behavior.
    removeDoneExport(F);

    Value *Undef = UndefValue::get(B.getFloatTy());
    B.CreateIntrinsic(Intrinsic::amdgcn_exp, { B.getFloatTy() },
                      {
                        B.getInt32(AMDGPU::Exp::ET_NULL),
                        B.getInt32(0), // enabled channels
                        Undef, Undef, Undef, Undef, // values
                        B.getTrue(), // done
                        B.getTrue(), // valid mask
                      });
  }

  PHINode *PN = nullptr;
  if (F.getReturnType()->isVoidTy()) {
    B.CreateRetVoid();
  } else {
    // If the function doesn't return void... add a PHI node to the block...
    PN = B.CreatePHI(F.getReturnType(), ReturningBlocks.size(),
                     "UnifiedRetVal");
    assert(!InsertExport);
    B.CreateRet(PN);
  }

  // Loop over all of the blocks, replacing the return instruction with an
  // unconditional branch.
  std::vector<DominatorTree::UpdateType> Updates;
  Updates.reserve(ReturningBlocks.size());
  for (BasicBlock *BB : ReturningBlocks) {
    // Add an incoming element to the PHI node for every return instruction that
    // is merging into this new block...
    if (PN)
      PN->addIncoming(BB->getTerminator()->getOperand(0), BB);

    // Remove and delete the return inst.
    BB->getTerminator()->eraseFromParent();
    BranchInst::Create(NewRetBlock, BB);
    Updates.push_back({DominatorTree::Insert, BB, NewRetBlock});
  }

  if (RequireAndPreserveDomTree)
    DTU.applyUpdates(Updates);
  Updates.clear();

  for (BasicBlock *BB : ReturningBlocks) {
    // Cleanup possible branch to unconditional branch to the return.
    simplifyCFG(BB, TTI, RequireAndPreserveDomTree ? &DTU : nullptr,
                SimplifyCFGOptions().bonusInstThreshold(2));
  }

  return NewRetBlock;
}

bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
  DominatorTree *DT = nullptr;
  if (RequireAndPreserveDomTree)
    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();

  auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();

  // If there's only one exit, we don't need to do anything, unless this is a
  // pixel shader and that exit is an infinite loop, since we still have to
  // insert an export in that case.
  if (PDT.root_size() <= 1 && F.getCallingConv() != CallingConv::AMDGPU_PS)
    return false;

  LegacyDivergenceAnalysis &DA = getAnalysis<LegacyDivergenceAnalysis>();

  // Loop over all of the blocks in a function, tracking all of the blocks that
  // return.
  SmallVector<BasicBlock *, 4> ReturningBlocks;
  SmallVector<BasicBlock *, 4> UniformlyReachedRetBlocks;
  SmallVector<BasicBlock *, 4> UnreachableBlocks;

  // Dummy return block for infinite loop.
  BasicBlock *DummyReturnBB = nullptr;

  bool InsertExport = false;

  bool Changed = false;
  std::vector<DominatorTree::UpdateType> Updates;

  for (BasicBlock *BB : PDT.roots()) {
    if (isa<ReturnInst>(BB->getTerminator())) {
      if (!isUniformlyReached(DA, *BB))
        ReturningBlocks.push_back(BB);
      else
        UniformlyReachedRetBlocks.push_back(BB);
    } else if (isa<UnreachableInst>(BB->getTerminator())) {
      if (!isUniformlyReached(DA, *BB))
        UnreachableBlocks.push_back(BB);
    } else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) {

      ConstantInt *BoolTrue = ConstantInt::getTrue(F.getContext());
      if (DummyReturnBB == nullptr) {
        DummyReturnBB = BasicBlock::Create(F.getContext(),
                                           "DummyReturnBlock", &F);
        Type *RetTy = F.getReturnType();
        Value *RetVal = RetTy->isVoidTy() ? nullptr : UndefValue::get(RetTy);

        // For pixel shaders, the producer guarantees that an export is
        // executed before each return instruction. However, if there is an
        // infinite loop and we insert a return ourselves, we need to uphold
        // that guarantee by inserting a null export. This can happen e.g. in
        // an infinite loop with kill instructions, which is supposed to
        // terminate. However, we don't need to do this if there is a non-void
        // return value, since then there is an epilog afterwards which will
        // still export.
        //
        // Note: In the case where only some threads enter the infinite loop,
        // this can result in the null export happening redundantly after the
        // original exports. However, the last "real" export happens after all
        // the threads that didn't enter an infinite loop converged, which
        // means that the only extra threads to execute the null export are
        // threads that entered the infinite loop, and they only could've
        // exited through being killed which sets their exec bit to 0.
        // Therefore, unless there's an actual infinite loop, which can have
        // invalid results, or there's a kill after the last export, which we
        // assume the frontend won't do, this export will have the same exec
        // mask as the last "real" export, and therefore the valid mask will be
        // overwritten with the same value and will still be correct. Also,
        // even though this forces an extra unnecessary export wait, we assume
        // that this happens rarely enough in practice that we don't have to
        // worry about performance.
        if (F.getCallingConv() == CallingConv::AMDGPU_PS &&
            RetTy->isVoidTy()) {
          InsertExport = true;
        }

        ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB);
        ReturningBlocks.push_back(DummyReturnBB);
      }

      if (BI->isUnconditional()) {
        BasicBlock *LoopHeaderBB = BI->getSuccessor(0);
        BI->eraseFromParent(); // Delete the unconditional branch.
        // Add a new conditional branch with a dummy edge to the return block.
        BranchInst::Create(LoopHeaderBB, DummyReturnBB, BoolTrue, BB);
        Updates.push_back({DominatorTree::Insert, BB, DummyReturnBB});
      } else { // Conditional branch.
        SmallVector<BasicBlock *, 2> Successors(succ_begin(BB), succ_end(BB));

        // Create a new transition block to hold the conditional branch.
        BasicBlock *TransitionBB = BB->splitBasicBlock(BI, "TransitionBlock");

        Updates.reserve(Updates.size() + 2 * Successors.size() + 2);

        // 'Successors' become successors of TransitionBB instead of BB,
        // and TransitionBB becomes a single successor of BB.
        Updates.push_back({DominatorTree::Insert, BB, TransitionBB});
        for (BasicBlock *Successor : Successors) {
          Updates.push_back({DominatorTree::Insert, TransitionBB, Successor});
          Updates.push_back({DominatorTree::Delete, BB, Successor});
        }

        // Create a branch that will always branch to the transition block and
        // references DummyReturnBB.
        BB->getTerminator()->eraseFromParent();
        BranchInst::Create(TransitionBB, DummyReturnBB, BoolTrue, BB);
        Updates.push_back({DominatorTree::Insert, BB, DummyReturnBB});
      }
      Changed = true;
    }
  }

  if (!UnreachableBlocks.empty()) {
    BasicBlock *UnreachableBlock = nullptr;

    if (UnreachableBlocks.size() == 1) {
      UnreachableBlock = UnreachableBlocks.front();
    } else {
      UnreachableBlock = BasicBlock::Create(F.getContext(),
                                            "UnifiedUnreachableBlock", &F);
      new UnreachableInst(F.getContext(), UnreachableBlock);

      Updates.reserve(Updates.size() + UnreachableBlocks.size());
      for (BasicBlock *BB : UnreachableBlocks) {
        // Remove and delete the unreachable inst.
        BB->getTerminator()->eraseFromParent();
        BranchInst::Create(UnreachableBlock, BB);
        Updates.push_back({DominatorTree::Insert, BB, UnreachableBlock});
      }
      Changed = true;
    }

    if (!ReturningBlocks.empty()) {
      // Don't create a new unreachable inst if we have a return. The
      // structurizer/annotator can't handle the multiple exits.

      Type *RetTy = F.getReturnType();
      Value *RetVal = RetTy->isVoidTy() ? nullptr : UndefValue::get(RetTy);
      // Remove and delete the unreachable inst.
      UnreachableBlock->getTerminator()->eraseFromParent();

      Function *UnreachableIntrin =
        Intrinsic::getDeclaration(F.getParent(), Intrinsic::amdgcn_unreachable);

      // Insert a call to an intrinsic tracking that this is an unreachable
      // point, in case we want to kill the active lanes or something later.
      CallInst::Create(UnreachableIntrin, {}, "", UnreachableBlock);

      // Don't create a scalar trap. We would only want to trap if this code was
      // really reached, but a scalar trap would happen even if no lanes
      // actually reached here.
      ReturnInst::Create(F.getContext(), RetVal, UnreachableBlock);
      ReturningBlocks.push_back(UnreachableBlock);
      Changed = true;
    }
  }

  // FIXME: add PDT here once simplifycfg is ready.
  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
  if (RequireAndPreserveDomTree)
    DTU.applyUpdates(Updates);
  Updates.clear();

  // Now handle return blocks.
  if (ReturningBlocks.empty())
    return Changed; // No blocks return

  if (ReturningBlocks.size() == 1 && !InsertExport)
    return Changed; // Already has a single return block

  const TargetTransformInfo &TTI
    = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);

  // Unify returning blocks. If we are going to insert the export, it is also
  // necessary to include blocks that are uniformly reached, because in addition
  // to inserting the export, the "done" bits on existing exports will be
  // cleared, and we do not want to end up with the normal export in a
  // non-unified, uniformly reached block with the "done" bit cleared.
  auto BlocksToUnify = std::move(ReturningBlocks);
  if (InsertExport) {
    llvm::append_range(BlocksToUnify, UniformlyReachedRetBlocks);
  }

  unifyReturnBlockSet(F, DTU, BlocksToUnify, InsertExport, TTI,
                      "UnifiedReturnBlock");
  return true;
}