//===- AMDGPURewriteUndefForPHI.cpp ---------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // This file implements the idea to rewrite undef incoming operand for certain // PHIs in structurized CFG. This pass only works on IR that has gone through // StructurizedCFG pass, and this pass has some additional limitation that make // it can only run after SIAnnotateControlFlow. // // To achieve optimal code generation for AMDGPU, we assume that uniformity // analysis reports the PHI in join block of divergent branch as uniform if // it has one unique uniform value plus additional undefined/poisoned incoming // value. That is to say the later compiler pipeline will ensure such PHI always // return uniform value and ensure it work correctly. Let's take a look at two // typical patterns in structured CFG that need to be taken care: (In both // patterns, block %if terminate with divergent branch.) // // Pattern A: Block with undefined incoming value dominates defined predecessor // %if // | \ // | %then // | / // %endif: %phi = phi [%undef, %if], [%uniform, %then] // // Pattern B: Block with defined incoming value dominates undefined predecessor // %if // | \ // | %then // | / // %endif: %phi = phi [%uniform, %if], [%undef, %then] // // For pattern A, by reporting %phi as uniform, the later pipeline need to make // sure it be handled correctly. The backend usually allocates a scalar register // and if any thread in a wave takes %then path, the scalar register will get // the %uniform value. // // For pattern B, we will replace the undef operand with the other defined value // in this pass. So the scalar register allocated for such PHI will get correct // liveness. Without this transformation, the scalar register may be overwritten // in the %then block. // // Limitation note: // If the join block of divergent threads is a loop header, the pass cannot // handle it correctly right now. For below case, the undef in %phi should also // be rewritten. Currently we depend on SIAnnotateControlFlow to split %header // block to get a separate join block, then we can rewrite the undef correctly. // %if // | \ // | %then // | / // -> %header: %phi = phi [%uniform, %if], [%undef, %then], [%uniform2, %header] // | | // \--- #include "AMDGPU.h" #include "llvm/Analysis/UniformityAnalysis.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" #include "llvm/InitializePasses.h" using namespace llvm; #define DEBUG_TYPE "amdgpu-rewrite-undef-for-phi" namespace { class AMDGPURewriteUndefForPHI : public FunctionPass { public: static char ID; AMDGPURewriteUndefForPHI() : FunctionPass(ID) { initializeAMDGPURewriteUndefForPHIPass(*PassRegistry::getPassRegistry()); } bool runOnFunction(Function &F) override; StringRef getPassName() const override { return "AMDGPU Rewrite Undef for PHI"; } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.addRequired(); AU.addPreserved(); AU.addPreserved(); AU.setPreservesCFG(); } }; } // end anonymous namespace char AMDGPURewriteUndefForPHI::ID = 0; INITIALIZE_PASS_BEGIN(AMDGPURewriteUndefForPHI, DEBUG_TYPE, "Rewrite undef for PHI", false, false) INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_END(AMDGPURewriteUndefForPHI, DEBUG_TYPE, "Rewrite undef for PHI", false, false) bool rewritePHIs(Function &F, UniformityInfo &UA, DominatorTree *DT) { bool Changed = false; SmallVector ToBeDeleted; for (auto &BB : F) { for (auto &PHI : BB.phis()) { if (UA.isDivergent(&PHI)) continue; // The unique incoming value except undef/poison for the PHI node. Value *UniqueDefinedIncoming = nullptr; // The divergent block with defined incoming value that dominates all // other block with the same incoming value. BasicBlock *DominateBB = nullptr; // Predecessors with undefined incoming value (excluding loop backedge). SmallVector Undefs; for (unsigned i = 0; i < PHI.getNumIncomingValues(); i++) { Value *Incoming = PHI.getIncomingValue(i); BasicBlock *IncomingBB = PHI.getIncomingBlock(i); if (Incoming == &PHI) continue; if (isa(Incoming)) { // Undef from loop backedge will not be replaced. if (!DT->dominates(&BB, IncomingBB)) Undefs.push_back(IncomingBB); continue; } if (!UniqueDefinedIncoming) { UniqueDefinedIncoming = Incoming; DominateBB = IncomingBB; } else if (Incoming == UniqueDefinedIncoming) { // Update DominateBB if necessary. if (DT->dominates(IncomingBB, DominateBB)) DominateBB = IncomingBB; } else { UniqueDefinedIncoming = nullptr; break; } } // We only need to replace the undef for the PHI which is merging // defined/undefined values from divergent threads. // TODO: We should still be able to replace undef value if the unique // value is a Constant. if (!UniqueDefinedIncoming || Undefs.empty() || !UA.isDivergent(DominateBB->getTerminator())) continue; // We only replace the undef when DominateBB truly dominates all the // other predecessors with undefined incoming value. Make sure DominateBB // dominates BB so that UniqueDefinedIncoming is available in BB and // afterwards. if (DT->dominates(DominateBB, &BB) && all_of(Undefs, [&](BasicBlock *UD) { return DT->dominates(DominateBB, UD); })) { PHI.replaceAllUsesWith(UniqueDefinedIncoming); ToBeDeleted.push_back(&PHI); Changed = true; } } } for (auto *PHI : ToBeDeleted) PHI->eraseFromParent(); return Changed; } bool AMDGPURewriteUndefForPHI::runOnFunction(Function &F) { UniformityInfo &UA = getAnalysis().getUniformityInfo(); DominatorTree *DT = &getAnalysis().getDomTree(); return rewritePHIs(F, UA, DT); } FunctionPass *llvm::createAMDGPURewriteUndefForPHIPass() { return new AMDGPURewriteUndefForPHI(); }