//===-- AMDGPUAtomicOptimizer.cpp -----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass optimizes atomic operations by using a single lane of a wavefront
/// to perform the atomic operation, thus reducing contention on that memory
/// location.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIDefines.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

#define DEBUG_TYPE "amdgpu-atomic-optimizer"

using namespace llvm;
using namespace llvm::AMDGPU;

namespace {

struct ReplacementInfo {
  Instruction *I;
  AtomicRMWInst::BinOp Op;
  unsigned ValIdx;
  bool ValDivergent;
};

class AMDGPUAtomicOptimizer : public FunctionPass,
                              public InstVisitor<AMDGPUAtomicOptimizer> {
private:
  SmallVector<ReplacementInfo, 8> ToReplace;
  const LegacyDivergenceAnalysis *DA;
  const DataLayout *DL;
  DominatorTree *DT;
  const GCNSubtarget *ST;
  bool IsPixelShader;

  Value *buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                   Value *const Identity) const;
  Value *buildShiftRight(IRBuilder<> &B, Value *V, Value *const Identity) const;
  void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx,
                      bool ValDivergent) const;

public:
  static char ID;

  AMDGPUAtomicOptimizer() : FunctionPass(ID) {}

  bool runOnFunction(Function &F) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addRequired<LegacyDivergenceAnalysis>();
    AU.addRequired<TargetPassConfig>();
  }

  void visitAtomicRMWInst(AtomicRMWInst &I);
  void visitIntrinsicInst(IntrinsicInst &I);
};

} // namespace

char AMDGPUAtomicOptimizer::ID = 0;

char &llvm::AMDGPUAtomicOptimizerID = AMDGPUAtomicOptimizer::ID;

bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
  if (skipFunction(F)) {
    return false;
  }

  DA = &getAnalysis<LegacyDivergenceAnalysis>();
  DL = &F.getParent()->getDataLayout();
  DominatorTreeWrapperPass *const DTW =
      getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  DT = DTW ? &DTW->getDomTree() : nullptr;
  const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  ST = &TM.getSubtarget<GCNSubtarget>(F);
  IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;

  visit(F);

  const bool Changed = !ToReplace.empty();

  for (ReplacementInfo &Info : ToReplace) {
    optimizeAtomic(*Info.I, Info.Op, Info.ValIdx, Info.ValDivergent);
  }

  ToReplace.clear();

  return Changed;
}

void AMDGPUAtomicOptimizer::visitAtomicRMWInst(AtomicRMWInst &I) {
  // Early exit for unhandled address space atomic instructions.
  switch (I.getPointerAddressSpace()) {
  default:
    return;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::LOCAL_ADDRESS:
    break;
  }

  AtomicRMWInst::BinOp Op = I.getOperation();

  switch (Op) {
  default:
    return;
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::And:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin:
    break;
  }

  const unsigned PtrIdx = 0;
  const unsigned ValIdx = 1;

  // If the pointer operand is divergent, then each lane is doing an atomic
  // operation on a different address, and we cannot optimize that.
  if (DA->isDivergentUse(&I.getOperandUse(PtrIdx))) {
    return;
  }

  const bool ValDivergent = DA->isDivergentUse(&I.getOperandUse(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget, and the atomic operation is 32
  // bits.
  if (ValDivergent &&
      (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
    return;
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}

void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) {
  AtomicRMWInst::BinOp Op;

  switch (I.getIntrinsicID()) {
  default:
    return;
  case Intrinsic::amdgcn_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
    Op = AtomicRMWInst::Add;
    break;
  case Intrinsic::amdgcn_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
    Op = AtomicRMWInst::Sub;
    break;
  case Intrinsic::amdgcn_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
    Op = AtomicRMWInst::And;
    break;
  case Intrinsic::amdgcn_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
    Op = AtomicRMWInst::Or;
    break;
  case Intrinsic::amdgcn_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
    Op = AtomicRMWInst::Xor;
    break;
  case Intrinsic::amdgcn_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
    Op = AtomicRMWInst::Min;
    break;
  case Intrinsic::amdgcn_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
    Op = AtomicRMWInst::UMin;
    break;
  case Intrinsic::amdgcn_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
    Op = AtomicRMWInst::Max;
    break;
  case Intrinsic::amdgcn_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
    Op = AtomicRMWInst::UMax;
    break;
  }

  const unsigned ValIdx = 0;

  const bool ValDivergent = DA->isDivergentUse(&I.getOperandUse(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget, and the atomic operation is 32
  // bits.
  if (ValDivergent &&
      (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
    return;
  }

  // If any of the other arguments to the intrinsic are divergent, we can't
  // optimize the operation.
  for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) {
    if (DA->isDivergentUse(&I.getOperandUse(Idx))) {
      return;
    }
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}

// Use the builder to create the non-atomic counterpart of the specified
// atomicrmw binary op.
static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                                  Value *LHS, Value *RHS) {
  CmpInst::Predicate Pred;

  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
    return B.CreateBinOp(Instruction::Add, LHS, RHS);
  case AtomicRMWInst::Sub:
    return B.CreateBinOp(Instruction::Sub, LHS, RHS);
  case AtomicRMWInst::And:
    return B.CreateBinOp(Instruction::And, LHS, RHS);
  case AtomicRMWInst::Or:
    return B.CreateBinOp(Instruction::Or, LHS, RHS);
  case AtomicRMWInst::Xor:
    return B.CreateBinOp(Instruction::Xor, LHS, RHS);

  case AtomicRMWInst::Max:
    Pred = CmpInst::ICMP_SGT;
    break;
  case AtomicRMWInst::Min:
    Pred = CmpInst::ICMP_SLT;
    break;
  case AtomicRMWInst::UMax:
    Pred = CmpInst::ICMP_UGT;
    break;
  case AtomicRMWInst::UMin:
    Pred = CmpInst::ICMP_ULT;
    break;
  }
  Value *Cond = B.CreateICmp(Pred, LHS, RHS);
  return B.CreateSelect(Cond, LHS, RHS);
}
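
// The scan built below is a Hillis-Steele style inclusive prefix scan
// assembled from cross-lane DPP moves: four row_shr steps (by 1, 2, 4 and 8
// lanes) produce an inclusive scan within each row of 16 lanes, and the row
// totals are then combined into the higher rows, either with the GFX9 row
// broadcast DPP modes or, on targets without them, with permlanex16/readlane.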
// Use the builder to create an inclusive scan of V across the wavefront, with
// all lanes active.
Value *AMDGPUAtomicOptimizer::buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                                        Value *V, Value *const Identity) const {
  Type *const Ty = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);
  Function *PermLaneX16 =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_permlanex16, {});
  Function *ReadLane =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});

  for (unsigned Idx = 0; Idx < 4; Idx++) {
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 | 1 << Idx),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
  }
  if (ST->hasDPPBroadcasts()) {
    // GFX9 has DPP row broadcast operations.
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST15), B.getInt32(0xa),
                      B.getInt32(0xf), B.getFalse()}));
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST31), B.getInt32(0xc),
                      B.getInt32(0xf), B.getFalse()}));
  } else {
    // On GFX10 all DPP operations are confined to a single row. To get cross-
    // row operations we have to use permlane or readlane.

    // Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes
    // 48..63).
    Value *const PermX =
        B.CreateCall(PermLaneX16, {V, V, B.getInt32(-1), B.getInt32(-1),
                                   B.getFalse(), B.getFalse()});
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, PermX, B.getInt32(DPP::QUAD_PERM_ID),
                      B.getInt32(0xa), B.getInt32(0xf), B.getFalse()}));
    if (!ST->isWave32()) {
      // Combine lane 31 into lanes 32..63.
      Value *const Lane31 = B.CreateCall(ReadLane, {V, B.getInt32(31)});
      V = buildNonAtomicBinOp(
          B, Op, V,
          B.CreateCall(UpdateDPP,
                       {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID),
                        B.getInt32(0xc), B.getInt32(0xf), B.getFalse()}));
    }
  }
  return V;
}

// Use the builder to create a shift right of V across the wavefront, with all
// lanes active, to turn an inclusive scan into an exclusive scan.
Value *AMDGPUAtomicOptimizer::buildShiftRight(IRBuilder<> &B, Value *V,
                                              Value *const Identity) const {
  Type *const Ty = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);
  Function *ReadLane =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
  Function *WriteLane =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {});

  if (ST->hasDPPWavefrontShifts()) {
    // GFX9 has DPP wavefront shift operations.
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
                      B.getInt32(0xf), B.getFalse()});
  } else {
    // On GFX10 all DPP operations are confined to a single row. To get cross-
    // row operations we have to use permlane or readlane.
    Value *Old = V;
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 + 1),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});

    // Copy the old lane 15 to the new lane 16.
    V = B.CreateCall(WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(15)}),
                                 B.getInt32(16), V});

    if (!ST->isWave32()) {
      // Copy the old lane 31 to the new lane 32.
      V = B.CreateCall(
          WriteLane,
          {B.CreateCall(ReadLane, {Old, B.getInt32(31)}), B.getInt32(32), V});

      // Copy the old lane 47 to the new lane 48.
      V = B.CreateCall(
          WriteLane,
          {B.CreateCall(ReadLane, {Old, B.getInt32(47)}), B.getInt32(48), V});
    }
  }

  return V;
}

static APInt getIdentityValueForAtomicOp(AtomicRMWInst::BinOp Op,
                                         unsigned BitWidth) {
  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::UMax:
    return APInt::getMinValue(BitWidth);
  case AtomicRMWInst::And:
  case AtomicRMWInst::UMin:
    return APInt::getMaxValue(BitWidth);
  case AtomicRMWInst::Max:
    return APInt::getSignedMinValue(BitWidth);
  case AtomicRMWInst::Min:
    return APInt::getSignedMaxValue(BitWidth);
  }
}

void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
                                           AtomicRMWInst::BinOp Op,
                                           unsigned ValIdx,
                                           bool ValDivergent) const {
  // Start building just before the instruction.
  IRBuilder<> B(&I);

  // If we are in a pixel shader, because of how we have to mask out helper
  // lane invocations, we need to record the entry and exit BB's.
  BasicBlock *PixelEntryBB = nullptr;
  BasicBlock *PixelExitBB = nullptr;

  // If we're optimizing an atomic within a pixel shader, we need to wrap the
  // entire atomic operation in a helper-lane check. We do not want any helper
  // lanes that are around only for the purposes of derivatives to take part
  // in any cross-lane communication, and we use a branch on whether the lane
  // is live to do this.
  if (IsPixelShader) {
    // Record I's original position as the entry block.
    PixelEntryBB = I.getParent();

    Value *const Cond = B.CreateIntrinsic(Intrinsic::amdgcn_ps_live, {}, {});
    Instruction *const NonHelperTerminator =
        SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, DT, nullptr);
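
    // amdgcn.ps.live is true only for lanes that are genuinely live, not for
    // helper invocations spawned just to compute derivatives, so the split
    // above keeps helper lanes out of the cross-lane code that follows.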

    // Record I's new position as the exit block.
    PixelExitBB = I.getParent();

    I.moveBefore(NonHelperTerminator);
    B.SetInsertPoint(&I);
  }

  Type *const Ty = I.getType();
  const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);
  Type *const VecTy = VectorType::get(B.getInt32Ty(), 2);

  // This is the value in the atomic operation we need to combine in order to
  // reduce the number of atomic operations.
  Value *const V = I.getOperand(ValIdx);

  // We need to know how many lanes are active within the wavefront, and we do
  // this by doing a ballot of active lanes.
  Type *const WaveTy = B.getIntNTy(ST->getWavefrontSize());
  CallInst *const Ballot = B.CreateIntrinsic(
      Intrinsic::amdgcn_icmp, {WaveTy, B.getInt32Ty()},
      {B.getInt32(1), B.getInt32(0), B.getInt32(CmpInst::ICMP_NE)});

  // We need to know how many lanes are active within the wavefront that are
  // below us. If we counted each lane linearly starting from 0, a lane is
  // below us only if its associated index was less than ours. We do this by
  // using the mbcnt intrinsic.
  Value *Mbcnt;
  if (ST->isWave32()) {
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                              {Ballot, B.getInt32(0)});
  } else {
    Value *const BitCast = B.CreateBitCast(Ballot, VecTy);
    Value *const ExtractLo = B.CreateExtractElement(BitCast, B.getInt32(0));
    Value *const ExtractHi = B.CreateExtractElement(BitCast, B.getInt32(1));
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                              {ExtractLo, B.getInt32(0)});
    Mbcnt =
        B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {ExtractHi, Mbcnt});
  }
  Mbcnt = B.CreateIntCast(Mbcnt, Ty, false);

  Value *const Identity = B.getInt(getIdentityValueForAtomicOp(Op, TyBitWidth));

  Value *ExclScan = nullptr;
  Value *NewV = nullptr;

  // If we have a divergent value in each lane, we need to combine the value
  // using DPP.
  if (ValDivergent) {
    // First we need to set all inactive invocations to the identity value, so
    // that they can correctly contribute to the final result.
    NewV = B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
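
    // For subtraction we scan with addition: each lane's contribution is
    // accumulated as a positive running total, and the single atomic sub
    // issued later subtracts the whole wavefront's total in one go.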
    const AtomicRMWInst::BinOp ScanOp =
        Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op;
    NewV = buildScan(B, ScanOp, NewV, Identity);
    ExclScan = buildShiftRight(B, NewV, Identity);

    // Read the value from the last lane, which has accumulated the values of
    // each active lane in the wavefront. This will be our new value which we
    // will provide to the atomic operation.
    Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
    if (TyBitWidth == 64) {
      Value *const ExtractLo = B.CreateTrunc(NewV, B.getInt32Ty());
      Value *const ExtractHi =
          B.CreateTrunc(B.CreateLShr(NewV, 32), B.getInt32Ty());
      CallInst *const ReadLaneLo = B.CreateIntrinsic(
          Intrinsic::amdgcn_readlane, {}, {ExtractLo, LastLaneIdx});
      CallInst *const ReadLaneHi = B.CreateIntrinsic(
          Intrinsic::amdgcn_readlane, {}, {ExtractHi, LastLaneIdx});
      Value *const PartialInsert = B.CreateInsertElement(
          UndefValue::get(VecTy), ReadLaneLo, B.getInt32(0));
      Value *const Insert =
          B.CreateInsertElement(PartialInsert, ReadLaneHi, B.getInt32(1));
      NewV = B.CreateBitCast(Insert, Ty);
    } else if (TyBitWidth == 32) {
      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
                               {NewV, LastLaneIdx});
    } else {
      llvm_unreachable("Unhandled atomic bit width");
    }

    // Finally mark the readlanes in the WWM section.
    NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, NewV);
  } else {
    switch (Op) {
    default:
      llvm_unreachable("Unhandled atomic op");

    case AtomicRMWInst::Add:
    case AtomicRMWInst::Sub: {
      // The new value we will be contributing to the atomic operation is the
      // old value times the number of active lanes.
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = B.CreateMul(V, Ctpop);
      break;
    }

    case AtomicRMWInst::And:
    case AtomicRMWInst::Or:
    case AtomicRMWInst::Max:
    case AtomicRMWInst::Min:
    case AtomicRMWInst::UMax:
    case AtomicRMWInst::UMin:
      // These operations with a uniform value are idempotent: doing the atomic
      // operation multiple times has the same effect as doing it once.
      NewV = V;
      break;

    case AtomicRMWInst::Xor:
      // The new value we will be contributing to the atomic operation is the
      // old value times the parity of the number of active lanes.
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = B.CreateMul(V, B.CreateAnd(Ctpop, 1));
      break;
    }
  }

  // We only want a single lane to enter our new control flow, and we do this
  // by checking if there are any active lanes below us. Only one lane will
  // have 0 active lanes below us, so that will be the only one to progress.
  Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getIntN(TyBitWidth, 0));

  // Store I's original basic block before we split the block.
  BasicBlock *const EntryBB = I.getParent();

  // We need to introduce some new control flow to force a single lane to be
  // active. We do this by splitting I's basic block at I, and introducing the
  // new block such that:
  // entry --> single_lane -\
  //       \------------------> exit
  Instruction *const SingleLaneTerminator =
      SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, DT, nullptr);

  // Move the IR builder into single_lane next.
  B.SetInsertPoint(SingleLaneTerminator);

  // Clone the original atomic operation into single lane, replacing the
  // original value with our newly created one.
  Instruction *const NewI = I.clone();
  B.Insert(NewI);
  NewI->setOperand(ValIdx, NewV);

  // Move the IR builder into exit next, and start inserting just before the
  // original instruction.
  B.SetInsertPoint(&I);
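
  // The result plumbing below (exit PHI, broadcast of the old value, per-lane
  // offset) is only needed when the original atomic's return value is used.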
  const bool NeedResult = !I.use_empty();
  if (NeedResult) {
    // Create a PHI node to get our new atomic result into the exit block.
    PHINode *const PHI = B.CreatePHI(Ty, 2);
    PHI->addIncoming(UndefValue::get(Ty), EntryBB);
    PHI->addIncoming(NewI, SingleLaneTerminator->getParent());

    // We need to broadcast the value from the lowest active lane (the first
    // lane) to all other lanes in the wavefront. We use an intrinsic for this,
    // but have to handle 64-bit broadcasts with two calls to this intrinsic.
    Value *BroadcastI = nullptr;

    if (TyBitWidth == 64) {
      Value *const ExtractLo = B.CreateTrunc(PHI, B.getInt32Ty());
      Value *const ExtractHi =
          B.CreateTrunc(B.CreateLShr(PHI, 32), B.getInt32Ty());
      CallInst *const ReadFirstLaneLo =
          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo);
      CallInst *const ReadFirstLaneHi =
          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi);
      Value *const PartialInsert = B.CreateInsertElement(
          UndefValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
      Value *const Insert =
          B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
      BroadcastI = B.CreateBitCast(Insert, Ty);
    } else if (TyBitWidth == 32) {
      BroadcastI = B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI);
    } else {
      llvm_unreachable("Unhandled atomic bit width");
    }

    // Now that we have the result of our single atomic operation, we need to
    // get our individual lane's slice into the result. We use the lane offset
    // we previously calculated combined with the atomic result value we got
    // from the first lane, to get our lane's index into the atomic result.
    Value *LaneOffset = nullptr;
    if (ValDivergent) {
      LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, ExclScan);
    } else {
      switch (Op) {
      default:
        llvm_unreachable("Unhandled atomic op");
      case AtomicRMWInst::Add:
      case AtomicRMWInst::Sub:
        LaneOffset = B.CreateMul(V, Mbcnt);
        break;
      case AtomicRMWInst::And:
      case AtomicRMWInst::Or:
      case AtomicRMWInst::Max:
      case AtomicRMWInst::Min:
      case AtomicRMWInst::UMax:
      case AtomicRMWInst::UMin:
        LaneOffset = B.CreateSelect(Cond, Identity, V);
        break;
      case AtomicRMWInst::Xor:
        LaneOffset = B.CreateMul(V, B.CreateAnd(Mbcnt, 1));
        break;
      }
    }
    Value *const Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);

    if (IsPixelShader) {
      // Need a final PHI to reconverge to above the helper lane branch mask.
      B.SetInsertPoint(PixelExitBB->getFirstNonPHI());

      PHINode *const PHI = B.CreatePHI(Ty, 2);
      PHI->addIncoming(UndefValue::get(Ty), PixelEntryBB);
      PHI->addIncoming(Result, I.getParent());
      I.replaceAllUsesWith(PHI);
    } else {
      // Replace the original atomic instruction with the new one.
      I.replaceAllUsesWith(Result);
    }
  }

  // And delete the original.
  I.eraseFromParent();
}

INITIALIZE_PASS_BEGIN(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                      "AMDGPU atomic optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                    "AMDGPU atomic optimizations", false, false)

FunctionPass *llvm::createAMDGPUAtomicOptimizerPass() {
  return new AMDGPUAtomicOptimizer();
}