//===-- AMDGPUAtomicOptimizer.cpp -----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass optimizes atomic operations by using a single lane of a wavefront
/// to perform the atomic operation, thus reducing contention on that memory
/// location.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/InitializePasses.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

#define DEBUG_TYPE "amdgpu-atomic-optimizer"

using namespace llvm;
using namespace llvm::AMDGPU;

namespace {

struct ReplacementInfo {
  Instruction *I;
  AtomicRMWInst::BinOp Op;
  unsigned ValIdx;
  bool ValDivergent;
};

class AMDGPUAtomicOptimizer : public FunctionPass,
                              public InstVisitor<AMDGPUAtomicOptimizer> {
private:
  SmallVector<ReplacementInfo, 8> ToReplace;
  const LegacyDivergenceAnalysis *DA;
  const DataLayout *DL;
  DominatorTree *DT;
  const GCNSubtarget *ST;
  bool IsPixelShader;

  Value *buildReduction(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                        Value *const Identity) const;
  Value *buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                   Value *const Identity) const;
  Value *buildShiftRight(IRBuilder<> &B, Value *V, Value *const Identity) const;
  void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx,
                      bool ValDivergent) const;

public:
  static char ID;

  AMDGPUAtomicOptimizer() : FunctionPass(ID) {}

  bool runOnFunction(Function &F) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addRequired<LegacyDivergenceAnalysis>();
    AU.addRequired<TargetPassConfig>();
  }

  void visitAtomicRMWInst(AtomicRMWInst &I);
  void visitIntrinsicInst(IntrinsicInst &I);
};

} // namespace

char AMDGPUAtomicOptimizer::ID = 0;

char &llvm::AMDGPUAtomicOptimizerID = AMDGPUAtomicOptimizer::ID;

bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
  if (skipFunction(F)) {
    return false;
  }

  DA = &getAnalysis<LegacyDivergenceAnalysis>();
  DL = &F.getParent()->getDataLayout();
  DominatorTreeWrapperPass *const DTW =
      getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  DT = DTW ? &DTW->getDomTree() : nullptr;
  const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  ST = &TM.getSubtarget<GCNSubtarget>(F);
  IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;

  visit(F);

  const bool Changed = !ToReplace.empty();

  for (ReplacementInfo &Info : ToReplace) {
    optimizeAtomic(*Info.I, Info.Op, Info.ValIdx, Info.ValDivergent);
  }

  ToReplace.clear();

  return Changed;
}

void AMDGPUAtomicOptimizer::visitAtomicRMWInst(AtomicRMWInst &I) {
  // Early exit for unhandled address space atomic instructions.
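  // Only global and LDS (local) atomics are candidates; atomics on any other
  // address space (e.g. flat or private) take the default path below and are
  // left untouched.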
  switch (I.getPointerAddressSpace()) {
  default:
    return;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::LOCAL_ADDRESS:
    break;
  }

  AtomicRMWInst::BinOp Op = I.getOperation();

  switch (Op) {
  default:
    return;
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::And:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin:
    break;
  }

  const unsigned PtrIdx = 0;
  const unsigned ValIdx = 1;

  // If the pointer operand is divergent, then each lane is doing an atomic
  // operation on a different address, and we cannot optimize that.
  if (DA->isDivergentUse(&I.getOperandUse(PtrIdx))) {
    return;
  }

  const bool ValDivergent = DA->isDivergentUse(&I.getOperandUse(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget, and the atomic operation is 32
  // bits.
  if (ValDivergent &&
      (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
    return;
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
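  // For example, a uniform `atomicrmw add` of 1 executed by 32 active lanes
  // becomes a single atomic add of 32 issued from one lane; every lane then
  // recovers the value its own atomic would have returned from the broadcast
  // result plus its lane offset (see optimizeAtomic below).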
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}

void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) {
  AtomicRMWInst::BinOp Op;

  switch (I.getIntrinsicID()) {
  default:
    return;
  case Intrinsic::amdgcn_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
    Op = AtomicRMWInst::Add;
    break;
  case Intrinsic::amdgcn_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
    Op = AtomicRMWInst::Sub;
    break;
  case Intrinsic::amdgcn_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
    Op = AtomicRMWInst::And;
    break;
  case Intrinsic::amdgcn_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
    Op = AtomicRMWInst::Or;
    break;
  case Intrinsic::amdgcn_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
    Op = AtomicRMWInst::Xor;
    break;
  case Intrinsic::amdgcn_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
    Op = AtomicRMWInst::Min;
    break;
  case Intrinsic::amdgcn_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
    Op = AtomicRMWInst::UMin;
    break;
  case Intrinsic::amdgcn_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
    Op = AtomicRMWInst::Max;
    break;
  case Intrinsic::amdgcn_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
    Op = AtomicRMWInst::UMax;
    break;
  }

  const unsigned ValIdx = 0;

  const bool ValDivergent = DA->isDivergentUse(&I.getOperandUse(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget, and the atomic operation is 32
  // bits.
  if (ValDivergent &&
      (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
    return;
  }

  // If any of the other arguments to the intrinsic are divergent, we can't
  // optimize the operation.
  for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) {
    if (DA->isDivergentUse(&I.getOperandUse(Idx))) {
      return;
    }
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}

// Use the builder to create the non-atomic counterpart of the specified
// atomicrmw binary op.
static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                                  Value *LHS, Value *RHS) {
  CmpInst::Predicate Pred;

  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
    return B.CreateBinOp(Instruction::Add, LHS, RHS);
  case AtomicRMWInst::Sub:
    return B.CreateBinOp(Instruction::Sub, LHS, RHS);
  case AtomicRMWInst::And:
    return B.CreateBinOp(Instruction::And, LHS, RHS);
  case AtomicRMWInst::Or:
    return B.CreateBinOp(Instruction::Or, LHS, RHS);
  case AtomicRMWInst::Xor:
    return B.CreateBinOp(Instruction::Xor, LHS, RHS);

  case AtomicRMWInst::Max:
    Pred = CmpInst::ICMP_SGT;
    break;
  case AtomicRMWInst::Min:
    Pred = CmpInst::ICMP_SLT;
    break;
  case AtomicRMWInst::UMax:
    Pred = CmpInst::ICMP_UGT;
    break;
  case AtomicRMWInst::UMin:
    Pred = CmpInst::ICMP_ULT;
    break;
  }
  Value *Cond = B.CreateICmp(Pred, LHS, RHS);
  return B.CreateSelect(Cond, LHS, RHS);
}

// Use the builder to create a reduction of V across the wavefront, with all
// lanes active, returning the same result in all lanes.
Value *AMDGPUAtomicOptimizer::buildReduction(IRBuilder<> &B,
                                             AtomicRMWInst::BinOp Op, Value *V,
                                             Value *const Identity) const {
  Type *const Ty = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);

  // Reduce within each row of 16 lanes.
  for (unsigned Idx = 0; Idx < 4; Idx++) {
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_XMASK0 | 1 << Idx),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
  }

  // Reduce within each pair of rows (i.e. 32 lanes).
  assert(ST->hasPermLaneX16());
  V = buildNonAtomicBinOp(
      B, Op, V,
      B.CreateIntrinsic(
          Intrinsic::amdgcn_permlanex16, {},
          {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()}));

  if (ST->isWave32())
    return V;

  // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
  // combine them with a scalar operation.
  Function *ReadLane =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
  Value *const Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
  Value *const Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
  return buildNonAtomicBinOp(B, Op, Lane0, Lane32);
}

// Use the builder to create an inclusive scan of V across the wavefront, with
// all lanes active.
Value *AMDGPUAtomicOptimizer::buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                                        Value *V, Value *const Identity) const {
  Type *const Ty = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);

  for (unsigned Idx = 0; Idx < 4; Idx++) {
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 | 1 << Idx),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
  }
  if (ST->hasDPPBroadcasts()) {
    // GFX9 has DPP row broadcast operations.
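    // Roughly (described for Op == Add, but the same holds for any scan op):
    // after the row_shr steps above each lane holds the inclusive scan of its
    // own row of 16. BCAST15 below feeds the running total in lane 15/47 into
    // rows 1 and 3 (row_mask 0xa), and BCAST31 then feeds lane 31's total into
    // the upper 32 lanes (row_mask 0xc), completing the wave-wide scan.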
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST15), B.getInt32(0xa),
                      B.getInt32(0xf), B.getFalse()}));
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST31), B.getInt32(0xc),
                      B.getInt32(0xf), B.getFalse()}));
  } else {
    // On GFX10 all DPP operations are confined to a single row. To get cross-
    // row operations we have to use permlane or readlane.

    // Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes
    // 48..63).
    assert(ST->hasPermLaneX16());
    Value *const PermX = B.CreateIntrinsic(
        Intrinsic::amdgcn_permlanex16, {},
        {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, PermX, B.getInt32(DPP::QUAD_PERM_ID),
                      B.getInt32(0xa), B.getInt32(0xf), B.getFalse()}));
    if (!ST->isWave32()) {
      // Combine lane 31 into lanes 32..63.
      Value *const Lane31 = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
                                              {V, B.getInt32(31)});
      V = buildNonAtomicBinOp(
          B, Op, V,
          B.CreateCall(UpdateDPP,
                       {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID),
                        B.getInt32(0xc), B.getInt32(0xf), B.getFalse()}));
    }
  }
  return V;
}

// Use the builder to create a shift right of V across the wavefront, with all
// lanes active, to turn an inclusive scan into an exclusive scan.
Value *AMDGPUAtomicOptimizer::buildShiftRight(IRBuilder<> &B, Value *V,
                                              Value *const Identity) const {
  Type *const Ty = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);

  if (ST->hasDPPWavefrontShifts()) {
    // GFX9 has DPP wavefront shift operations.
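    // In effect, wave_shr:1 moves each lane's value into the next-higher lane
    // across the whole wave; lane 0 has no source lane and so picks up the
    // Identity ("old") operand, which is exactly what an exclusive scan needs.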
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
                      B.getInt32(0xf), B.getFalse()});
  } else {
    Function *ReadLane =
        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
    Function *WriteLane =
        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {});

    // On GFX10 all DPP operations are confined to a single row. To get cross-
    // row operations we have to use permlane or readlane.
    Value *Old = V;
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 + 1),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});

    // Copy the old lane 15 to the new lane 16.
    V = B.CreateCall(WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(15)}),
                                 B.getInt32(16), V});

    if (!ST->isWave32()) {
      // Copy the old lane 31 to the new lane 32.
      V = B.CreateCall(
          WriteLane,
          {B.CreateCall(ReadLane, {Old, B.getInt32(31)}), B.getInt32(32), V});

      // Copy the old lane 47 to the new lane 48.
      V = B.CreateCall(
          WriteLane,
          {B.CreateCall(ReadLane, {Old, B.getInt32(47)}), B.getInt32(48), V});
    }
  }

  return V;
}

static APInt getIdentityValueForAtomicOp(AtomicRMWInst::BinOp Op,
                                         unsigned BitWidth) {
  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::UMax:
    return APInt::getMinValue(BitWidth);
  case AtomicRMWInst::And:
  case AtomicRMWInst::UMin:
    return APInt::getMaxValue(BitWidth);
  case AtomicRMWInst::Max:
    return APInt::getSignedMinValue(BitWidth);
  case AtomicRMWInst::Min:
    return APInt::getSignedMaxValue(BitWidth);
  }
}

static Value *buildMul(IRBuilder<> &B, Value *LHS, Value *RHS) {
  const ConstantInt *CI = dyn_cast<ConstantInt>(LHS);
  return (CI && CI->isOne()) ? RHS : B.CreateMul(LHS, RHS);
}

void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
                                           AtomicRMWInst::BinOp Op,
                                           unsigned ValIdx,
                                           bool ValDivergent) const {
  // Start building just before the instruction.
  IRBuilder<> B(&I);

  // If we are in a pixel shader, because of how we have to mask out helper
  // lane invocations, we need to record the entry and exit BB's.
  BasicBlock *PixelEntryBB = nullptr;
  BasicBlock *PixelExitBB = nullptr;

  // If we're optimizing an atomic within a pixel shader, we need to wrap the
  // entire atomic operation in a helper-lane check. We do not want any helper
  // lanes that are around only for the purposes of derivatives to take part
  // in any cross-lane communication, and we use a branch on whether the lane
  // is live to do this.
  if (IsPixelShader) {
    // Record I's original position as the entry block.
    PixelEntryBB = I.getParent();

    Value *const Cond = B.CreateIntrinsic(Intrinsic::amdgcn_ps_live, {}, {});
    Instruction *const NonHelperTerminator =
        SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, DT, nullptr);

    // Record I's new position as the exit block.
    PixelExitBB = I.getParent();

    I.moveBefore(NonHelperTerminator);
    B.SetInsertPoint(&I);
  }

  Type *const Ty = I.getType();
  const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);
  auto *const VecTy = FixedVectorType::get(B.getInt32Ty(), 2);

  // This is the value in the atomic operation we need to combine in order to
  // reduce the number of atomic operations.
  Value *const V = I.getOperand(ValIdx);

  // We need to know how many lanes are active within the wavefront, and we do
  // this by doing a ballot of active lanes.
  Type *const WaveTy = B.getIntNTy(ST->getWavefrontSize());
  CallInst *const Ballot =
      B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());

  // We need to know how many lanes are active within the wavefront that are
  // below us. If we counted each lane linearly starting from 0, a lane is
  // below us only if its associated index was less than ours. We do this by
  // using the mbcnt intrinsic.
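  // mbcnt counts the set bits of the ballot mask strictly below the current
  // lane, so e.g. the third active lane in the wave ends up with Mbcnt == 2
  // regardless of its physical lane id.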
  Value *Mbcnt;
  if (ST->isWave32()) {
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                              {Ballot, B.getInt32(0)});
  } else {
    Value *const BitCast = B.CreateBitCast(Ballot, VecTy);
    Value *const ExtractLo = B.CreateExtractElement(BitCast, B.getInt32(0));
    Value *const ExtractHi = B.CreateExtractElement(BitCast, B.getInt32(1));
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                              {ExtractLo, B.getInt32(0)});
    Mbcnt =
        B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {ExtractHi, Mbcnt});
  }
  Mbcnt = B.CreateIntCast(Mbcnt, Ty, false);

  Value *const Identity = B.getInt(getIdentityValueForAtomicOp(Op, TyBitWidth));

  Value *ExclScan = nullptr;
  Value *NewV = nullptr;

  const bool NeedResult = !I.use_empty();

  // If we have a divergent value in each lane, we need to combine the value
  // using DPP.
  if (ValDivergent) {
    // First we need to set all inactive invocations to the identity value, so
    // that they can correctly contribute to the final result.
    NewV = B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});

    const AtomicRMWInst::BinOp ScanOp =
        Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op;
    if (!NeedResult && ST->hasPermLaneX16()) {
      // On GFX10 the permlanex16 instruction helps us build a reduction
      // without too many readlanes and writelanes, which are generally bad
      // for performance.
      NewV = buildReduction(B, ScanOp, NewV, Identity);
    } else {
      NewV = buildScan(B, ScanOp, NewV, Identity);
      if (NeedResult)
        ExclScan = buildShiftRight(B, NewV, Identity);

      // Read the value from the last lane, which has accumulated the values
      // of each active lane in the wavefront. This will be our new value
      // which we will provide to the atomic operation.
      Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
      assert(TyBitWidth == 32);
      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
                               {NewV, LastLaneIdx});
    }

    // Finally mark the readlanes in the WWM section.
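    // (The cross-lane scan above has to see the Identity values that
    // set_inactive wrote into inactive lanes, so the whole computation is
    // wrapped in a strict whole-wave-mode region rather than being left
    // subject to the current exec mask.)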
    NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
  } else {
    switch (Op) {
    default:
      llvm_unreachable("Unhandled atomic op");

    case AtomicRMWInst::Add:
    case AtomicRMWInst::Sub: {
      // The new value we will be contributing to the atomic operation is the
      // old value times the number of active lanes.
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = buildMul(B, V, Ctpop);
      break;
    }

    case AtomicRMWInst::And:
    case AtomicRMWInst::Or:
    case AtomicRMWInst::Max:
    case AtomicRMWInst::Min:
    case AtomicRMWInst::UMax:
    case AtomicRMWInst::UMin:
      // These operations with a uniform value are idempotent: doing the atomic
      // operation multiple times has the same effect as doing it once.
      NewV = V;
      break;

    case AtomicRMWInst::Xor:
      // The new value we will be contributing to the atomic operation is the
      // old value times the parity of the number of active lanes.
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = buildMul(B, V, B.CreateAnd(Ctpop, 1));
      break;
    }
  }

  // We only want a single lane to enter our new control flow, and we do this
  // by checking if there are any active lanes below us. Only one lane will
  // have 0 active lanes below us, so that will be the only one to progress.
  Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getIntN(TyBitWidth, 0));

  // Store I's original basic block before we split the block.
  BasicBlock *const EntryBB = I.getParent();

  // We need to introduce some new control flow to force a single lane to be
  // active. We do this by splitting I's basic block at I, and introducing the
  // new block such that:
  // entry --> single_lane -\
  //       \------------------> exit
  Instruction *const SingleLaneTerminator =
      SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, DT, nullptr);

  // Move the IR builder into single_lane next.
  B.SetInsertPoint(SingleLaneTerminator);

  // Clone the original atomic operation into single lane, replacing the
  // original value with our newly created one.
  Instruction *const NewI = I.clone();
  B.Insert(NewI);
  NewI->setOperand(ValIdx, NewV);

  // Move the IR builder into exit next, and start inserting just before the
  // original instruction.
  B.SetInsertPoint(&I);

  if (NeedResult) {
    // Create a PHI node to get our new atomic result into the exit block.
    PHINode *const PHI = B.CreatePHI(Ty, 2);
    PHI->addIncoming(UndefValue::get(Ty), EntryBB);
    PHI->addIncoming(NewI, SingleLaneTerminator->getParent());

    // We need to broadcast the value from the lowest active lane (the first
    // active lane) to all other lanes in the wavefront. We use an intrinsic
    // for this, but have to handle 64-bit broadcasts with two calls to this
    // intrinsic.
    Value *BroadcastI = nullptr;

    if (TyBitWidth == 64) {
      Value *const ExtractLo = B.CreateTrunc(PHI, B.getInt32Ty());
      Value *const ExtractHi =
          B.CreateTrunc(B.CreateLShr(PHI, 32), B.getInt32Ty());
      CallInst *const ReadFirstLaneLo =
          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo);
      CallInst *const ReadFirstLaneHi =
          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi);
      Value *const PartialInsert = B.CreateInsertElement(
          UndefValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
      Value *const Insert =
          B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
      BroadcastI = B.CreateBitCast(Insert, Ty);
    } else if (TyBitWidth == 32) {
      BroadcastI = B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI);
    } else {
      llvm_unreachable("Unhandled atomic bit width");
    }

    // Now that we have the result of our single atomic operation, we need to
    // get our individual lane's slice of the result. We combine the lane
    // offset we previously calculated with the atomic result value broadcast
    // from the first lane to reconstruct what each lane's own atomic would
    // have returned.
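    // Illustrative example for an add where four active lanes contribute
    // 5, 7, 1 and 2: the single atomic adds 15 and returns the old memory
    // value M; the third lane's result is then M + (5 + 7), i.e. the broadcast
    // old value plus that lane's exclusive prefix sum (divergent case), or
    // M + Mbcnt * V when every lane contributes the same V (uniform case).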
    Value *LaneOffset = nullptr;
    if (ValDivergent) {
      LaneOffset =
          B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan);
    } else {
      switch (Op) {
      default:
        llvm_unreachable("Unhandled atomic op");
      case AtomicRMWInst::Add:
      case AtomicRMWInst::Sub:
        LaneOffset = buildMul(B, V, Mbcnt);
        break;
      case AtomicRMWInst::And:
      case AtomicRMWInst::Or:
      case AtomicRMWInst::Max:
      case AtomicRMWInst::Min:
      case AtomicRMWInst::UMax:
      case AtomicRMWInst::UMin:
        LaneOffset = B.CreateSelect(Cond, Identity, V);
        break;
      case AtomicRMWInst::Xor:
        LaneOffset = buildMul(B, V, B.CreateAnd(Mbcnt, 1));
        break;
      }
    }
    Value *const Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);

    if (IsPixelShader) {
      // We need a final PHI to reconverge above the helper-lane branch.
      B.SetInsertPoint(PixelExitBB->getFirstNonPHI());

      PHINode *const PHI = B.CreatePHI(Ty, 2);
      PHI->addIncoming(UndefValue::get(Ty), PixelEntryBB);
      PHI->addIncoming(Result, I.getParent());
      I.replaceAllUsesWith(PHI);
    } else {
      // Replace the original atomic instruction with the new one.
      I.replaceAllUsesWith(Result);
    }
  }

  // And delete the original.
  I.eraseFromParent();
}

INITIALIZE_PASS_BEGIN(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                      "AMDGPU atomic optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                    "AMDGPU atomic optimizations", false, false)

FunctionPass *llvm::createAMDGPUAtomicOptimizerPass() {
  return new AMDGPUAtomicOptimizer();
}
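
// Note: this pass is normally scheduled from the AMDGPU target's codegen
// pipeline (behind a target option) rather than being run standalone. A rough
// sketch of what that call site looks like, with the option name only
// illustrative:
//
//   if (EnableAtomicOptimizations)   // hypothetical option name
//     addPass(createAMDGPUAtomicOptimizerPass());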