//===-- AMDGPUAtomicOptimizer.cpp -----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass optimizes atomic operations by using a single lane of a wavefront
/// to perform the atomic operation, thus reducing contention on that memory
/// location.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/InitializePasses.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

#define DEBUG_TYPE "amdgpu-atomic-optimizer"

using namespace llvm;
using namespace llvm::AMDGPU;

namespace {

struct ReplacementInfo {
  Instruction *I;
  AtomicRMWInst::BinOp Op;
  unsigned ValIdx;
  bool ValDivergent;
};

class AMDGPUAtomicOptimizer : public FunctionPass,
                              public InstVisitor<AMDGPUAtomicOptimizer> {
private:
  SmallVector<ReplacementInfo, 8> ToReplace;
  const LegacyDivergenceAnalysis *DA;
  const DataLayout *DL;
  DominatorTree *DT;
  const GCNSubtarget *ST;
  bool IsPixelShader;

  Value *buildReduction(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                        Value *const Identity) const;
  Value *buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                   Value *const Identity) const;
  Value *buildShiftRight(IRBuilder<> &B, Value *V, Value *const Identity) const;
  void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx,
                      bool ValDivergent) const;

public:
  static char ID;

  AMDGPUAtomicOptimizer() : FunctionPass(ID) {}

  bool runOnFunction(Function &F) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addRequired<LegacyDivergenceAnalysis>();
    AU.addRequired<TargetPassConfig>();
  }

  void visitAtomicRMWInst(AtomicRMWInst &I);
  void visitIntrinsicInst(IntrinsicInst &I);
};

} // namespace

char AMDGPUAtomicOptimizer::ID = 0;

char &llvm::AMDGPUAtomicOptimizerID = AMDGPUAtomicOptimizer::ID;

bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
  if (skipFunction(F)) {
    return false;
  }

  DA = &getAnalysis<LegacyDivergenceAnalysis>();
  DL = &F.getParent()->getDataLayout();
  DominatorTreeWrapperPass *const DTW =
      getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  DT = DTW ? &DTW->getDomTree() : nullptr;
  const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  ST = &TM.getSubtarget<GCNSubtarget>(F);
  IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;

  visit(F);

  const bool Changed = !ToReplace.empty();

  for (ReplacementInfo &Info : ToReplace) {
    optimizeAtomic(*Info.I, Info.Op, Info.ValIdx, Info.ValDivergent);
  }

  ToReplace.clear();

  return Changed;
}

void AMDGPUAtomicOptimizer::visitAtomicRMWInst(AtomicRMWInst &I) {
  // Early exit for unhandled address space atomic instructions.
  switch (I.getPointerAddressSpace()) {
  default:
    return;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::LOCAL_ADDRESS:
    break;
  }

  AtomicRMWInst::BinOp Op = I.getOperation();

  switch (Op) {
  default:
    return;
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::And:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin:
    break;
  }

  const unsigned PtrIdx = 0;
  const unsigned ValIdx = 1;

  // If the pointer operand is divergent, then each lane is doing an atomic
  // operation on a different address, and we cannot optimize that.
  if (DA->isDivergentUse(&I.getOperandUse(PtrIdx))) {
    return;
  }

  const bool ValDivergent = DA->isDivergentUse(&I.getOperandUse(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget, and the atomic operation is 32
  // bits.
  if (ValDivergent &&
      (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
    return;
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}

void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) {
  AtomicRMWInst::BinOp Op;

  switch (I.getIntrinsicID()) {
  default:
    return;
  case Intrinsic::amdgcn_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
    Op = AtomicRMWInst::Add;
    break;
  case Intrinsic::amdgcn_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
    Op = AtomicRMWInst::Sub;
    break;
  case Intrinsic::amdgcn_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
    Op = AtomicRMWInst::And;
    break;
  case Intrinsic::amdgcn_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
    Op = AtomicRMWInst::Or;
    break;
  case Intrinsic::amdgcn_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
    Op = AtomicRMWInst::Xor;
    break;
  case Intrinsic::amdgcn_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
    Op = AtomicRMWInst::Min;
    break;
  case Intrinsic::amdgcn_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
    Op = AtomicRMWInst::UMin;
    break;
  case Intrinsic::amdgcn_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
    Op = AtomicRMWInst::Max;
    break;
  case Intrinsic::amdgcn_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
    Op = AtomicRMWInst::UMax;
    break;
  }

  const unsigned ValIdx = 0;

  const bool ValDivergent = DA->isDivergentUse(&I.getOperandUse(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget, and the atomic operation is 32
  // bits.
  if (ValDivergent &&
      (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
    return;
  }

  // If any of the other arguments to the intrinsic are divergent, we can't
  // optimize the operation.
  for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) {
    if (DA->isDivergentUse(&I.getOperandUse(Idx))) {
      return;
    }
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}

// Use the builder to create the non-atomic counterpart of the specified
// atomicrmw binary op.
static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                                  Value *LHS, Value *RHS) {
  CmpInst::Predicate Pred;

  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
    return B.CreateBinOp(Instruction::Add, LHS, RHS);
  case AtomicRMWInst::Sub:
    return B.CreateBinOp(Instruction::Sub, LHS, RHS);
  case AtomicRMWInst::And:
    return B.CreateBinOp(Instruction::And, LHS, RHS);
  case AtomicRMWInst::Or:
    return B.CreateBinOp(Instruction::Or, LHS, RHS);
  case AtomicRMWInst::Xor:
    return B.CreateBinOp(Instruction::Xor, LHS, RHS);

  case AtomicRMWInst::Max:
    Pred = CmpInst::ICMP_SGT;
    break;
  case AtomicRMWInst::Min:
    Pred = CmpInst::ICMP_SLT;
    break;
  case AtomicRMWInst::UMax:
    Pred = CmpInst::ICMP_UGT;
    break;
  case AtomicRMWInst::UMin:
    Pred = CmpInst::ICMP_ULT;
    break;
  }
  Value *Cond = B.CreateICmp(Pred, LHS, RHS);
  return B.CreateSelect(Cond, LHS, RHS);
}

// Use the builder to create a reduction of V across the wavefront, with all
// lanes active, returning the same result in all lanes.
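// The value is combined with a butterfly of DPP row_xmask swizzles within each
// row of 16 lanes, then across rows with permlanex16 and, on wave64, either
// permlane64 or a pair of readlanes.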
Value *AMDGPUAtomicOptimizer::buildReduction(IRBuilder<> &B,
                                             AtomicRMWInst::BinOp Op, Value *V,
                                             Value *const Identity) const {
  Type *const Ty = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);

  // Reduce within each row of 16 lanes.
  for (unsigned Idx = 0; Idx < 4; Idx++) {
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_XMASK0 | 1 << Idx),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
  }

  // Reduce within each pair of rows (i.e. 32 lanes).
  assert(ST->hasPermLaneX16());
  V = buildNonAtomicBinOp(
      B, Op, V,
      B.CreateIntrinsic(
          Intrinsic::amdgcn_permlanex16, {},
          {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()}));

  if (ST->isWave32())
    return V;

  if (ST->hasPermLane64()) {
    // Reduce across the upper and lower 32 lanes.
    return buildNonAtomicBinOp(
        B, Op, V, B.CreateIntrinsic(Intrinsic::amdgcn_permlane64, {}, V));
  }

  // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
  // combine them with a scalar operation.
  Function *ReadLane =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
  Value *const Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
  Value *const Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
  return buildNonAtomicBinOp(B, Op, Lane0, Lane32);
}

// Use the builder to create an inclusive scan of V across the wavefront, with
// all lanes active.
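// Within each row of 16 lanes the scan is built from DPP row shifts of 1, 2, 4
// and 8 lanes; the per-row results are then propagated across rows using DPP
// row broadcasts where available, or permlanex16/readlane otherwise.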
Value *AMDGPUAtomicOptimizer::buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                                        Value *V, Value *const Identity) const {
  Type *const Ty = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);

  for (unsigned Idx = 0; Idx < 4; Idx++) {
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 | 1 << Idx),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
  }
  if (ST->hasDPPBroadcasts()) {
    // GFX9 has DPP row broadcast operations.
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST15), B.getInt32(0xa),
                      B.getInt32(0xf), B.getFalse()}));
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST31), B.getInt32(0xc),
                      B.getInt32(0xf), B.getFalse()}));
  } else {
    // On GFX10 all DPP operations are confined to a single row. To get cross-
    // row operations we have to use permlane or readlane.

    // Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes
    // 48..63).
    assert(ST->hasPermLaneX16());
    Value *const PermX = B.CreateIntrinsic(
        Intrinsic::amdgcn_permlanex16, {},
        {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, PermX, B.getInt32(DPP::QUAD_PERM_ID),
                      B.getInt32(0xa), B.getInt32(0xf), B.getFalse()}));
    if (!ST->isWave32()) {
      // Combine lane 31 into lanes 32..63.
      Value *const Lane31 = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
                                              {V, B.getInt32(31)});
      V = buildNonAtomicBinOp(
          B, Op, V,
          B.CreateCall(UpdateDPP,
                       {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID),
                        B.getInt32(0xc), B.getInt32(0xf), B.getFalse()}));
    }
  }
  return V;
}

// Use the builder to create a shift right of V across the wavefront, with all
// lanes active, to turn an inclusive scan into an exclusive scan.
Value *AMDGPUAtomicOptimizer::buildShiftRight(IRBuilder<> &B, Value *V,
                                              Value *const Identity) const {
  Type *const Ty = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);

  if (ST->hasDPPWavefrontShifts()) {
    // GFX9 has DPP wavefront shift operations.
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
                      B.getInt32(0xf), B.getFalse()});
  } else {
    Function *ReadLane =
        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
    Function *WriteLane =
        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {});

    // On GFX10 all DPP operations are confined to a single row. To get cross-
    // row operations we have to use permlane or readlane.
    Value *Old = V;
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 + 1),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});

    // Copy the old lane 15 to the new lane 16.
    V = B.CreateCall(WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(15)}),
                                 B.getInt32(16), V});

    if (!ST->isWave32()) {
      // Copy the old lane 31 to the new lane 32.
      V = B.CreateCall(
          WriteLane,
          {B.CreateCall(ReadLane, {Old, B.getInt32(31)}), B.getInt32(32), V});

      // Copy the old lane 47 to the new lane 48.
      V = B.CreateCall(
          WriteLane,
          {B.CreateCall(ReadLane, {Old, B.getInt32(47)}), B.getInt32(48), V});
    }
  }

  return V;
}

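// Get the identity value for the given atomic binary op at the given bit
// width, i.e. the value I for which "X op I == X", so that lanes initialized
// to it do not affect the combined result.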
static APInt getIdentityValueForAtomicOp(AtomicRMWInst::BinOp Op,
                                         unsigned BitWidth) {
  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::UMax:
    return APInt::getMinValue(BitWidth);
  case AtomicRMWInst::And:
  case AtomicRMWInst::UMin:
    return APInt::getMaxValue(BitWidth);
  case AtomicRMWInst::Max:
    return APInt::getSignedMinValue(BitWidth);
  case AtomicRMWInst::Min:
    return APInt::getSignedMaxValue(BitWidth);
  }
}

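// Build a multiply of LHS and RHS, folding the case where LHS is the constant
// 1 so that no multiply instruction is emitted.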
static Value *buildMul(IRBuilder<> &B, Value *LHS, Value *RHS) {
  const ConstantInt *CI = dyn_cast<ConstantInt>(LHS);
  return (CI && CI->isOne()) ? RHS : B.CreateMul(LHS, RHS);
}

void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
                                           AtomicRMWInst::BinOp Op,
                                           unsigned ValIdx,
                                           bool ValDivergent) const {
  // Start building just before the instruction.
  IRBuilder<> B(&I);

  // If we are in a pixel shader, because of how we have to mask out helper
  // lane invocations, we need to record the entry and exit BB's.
  BasicBlock *PixelEntryBB = nullptr;
  BasicBlock *PixelExitBB = nullptr;

  // If we're optimizing an atomic within a pixel shader, we need to wrap the
  // entire atomic operation in a helper-lane check. We do not want any helper
  // lanes that are around only for the purposes of derivatives to take part
  // in any cross-lane communication, and we use a branch on whether the lane is
  // live to do this.
  if (IsPixelShader) {
    // Record I's original position as the entry block.
    PixelEntryBB = I.getParent();

    Value *const Cond = B.CreateIntrinsic(Intrinsic::amdgcn_ps_live, {}, {});
    Instruction *const NonHelperTerminator =
        SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, DT, nullptr);

    // Record I's new position as the exit block.
    PixelExitBB = I.getParent();

    I.moveBefore(NonHelperTerminator);
    B.SetInsertPoint(&I);
  }

  Type *const Ty = I.getType();
  const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);
  auto *const VecTy = FixedVectorType::get(B.getInt32Ty(), 2);

  // This is the value in the atomic operation we need to combine in order to
  // reduce the number of atomic operations.
  Value *const V = I.getOperand(ValIdx);

  // We need to know how many lanes are active within the wavefront, and we do
  // this by doing a ballot of active lanes.
  Type *const WaveTy = B.getIntNTy(ST->getWavefrontSize());
  CallInst *const Ballot =
      B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());

  // We need to know how many lanes are active within the wavefront that are
  // below us. If we counted each lane linearly starting from 0, a lane is
  // below us only if its associated index was less than ours. We do this by
  // using the mbcnt intrinsic.
  Value *Mbcnt;
  if (ST->isWave32()) {
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                              {Ballot, B.getInt32(0)});
  } else {
    Value *const BitCast = B.CreateBitCast(Ballot, VecTy);
    Value *const ExtractLo = B.CreateExtractElement(BitCast, B.getInt32(0));
    Value *const ExtractHi = B.CreateExtractElement(BitCast, B.getInt32(1));
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                              {ExtractLo, B.getInt32(0)});
    Mbcnt =
        B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {ExtractHi, Mbcnt});
  }
  Mbcnt = B.CreateIntCast(Mbcnt, Ty, false);

  Value *const Identity = B.getInt(getIdentityValueForAtomicOp(Op, TyBitWidth));

  Value *ExclScan = nullptr;
  Value *NewV = nullptr;

  const bool NeedResult = !I.use_empty();

  // If we have a divergent value in each lane, we need to combine the value
  // using DPP.
  if (ValDivergent) {
    // First we need to set all inactive invocations to the identity value, so
    // that they can correctly contribute to the final result.
    NewV = B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});

    const AtomicRMWInst::BinOp ScanOp =
        Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op;
    if (!NeedResult && ST->hasPermLaneX16()) {
      // On GFX10 the permlanex16 instruction helps us build a reduction without
      // too many readlanes and writelanes, which are generally bad for
      // performance.
      NewV = buildReduction(B, ScanOp, NewV, Identity);
    } else {
      NewV = buildScan(B, ScanOp, NewV, Identity);
      if (NeedResult)
        ExclScan = buildShiftRight(B, NewV, Identity);

      // Read the value from the last lane, which has accumulated the values of
      // each active lane in the wavefront. This will be our new value which we
      // will provide to the atomic operation.
      Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
      assert(TyBitWidth == 32);
      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
                               {NewV, LastLaneIdx});
    }

    // Finally mark the readlanes in the WWM section.
    NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
  } else {
    switch (Op) {
    default:
      llvm_unreachable("Unhandled atomic op");

    case AtomicRMWInst::Add:
    case AtomicRMWInst::Sub: {
      // The new value we will be contributing to the atomic operation is the
      // old value times the number of active lanes.
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = buildMul(B, V, Ctpop);
      break;
    }

    case AtomicRMWInst::And:
    case AtomicRMWInst::Or:
    case AtomicRMWInst::Max:
    case AtomicRMWInst::Min:
    case AtomicRMWInst::UMax:
    case AtomicRMWInst::UMin:
      // These operations with a uniform value are idempotent: doing the atomic
      // operation multiple times has the same effect as doing it once.
      NewV = V;
      break;

    case AtomicRMWInst::Xor:
      // The new value we will be contributing to the atomic operation is the
      // old value times the parity of the number of active lanes.
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = buildMul(B, V, B.CreateAnd(Ctpop, 1));
      break;
    }
  }

  // We only want a single lane to enter our new control flow, and we do this
  // by checking if there are any active lanes below us. Only one lane will
  // have 0 active lanes below it, so that will be the only one to progress.
  Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getIntN(TyBitWidth, 0));

  // Store I's original basic block before we split the block.
  BasicBlock *const EntryBB = I.getParent();

  // We need to introduce some new control flow to force a single lane to be
  // active. We do this by splitting I's basic block at I, and introducing the
  // new block such that:
  // entry --> single_lane -\
  //       \------------------> exit
  Instruction *const SingleLaneTerminator =
      SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, DT, nullptr);

  // Move the IR builder into single_lane next.
  B.SetInsertPoint(SingleLaneTerminator);

  // Clone the original atomic operation into single lane, replacing the
  // original value with our newly created one.
  Instruction *const NewI = I.clone();
  B.Insert(NewI);
  NewI->setOperand(ValIdx, NewV);

  // Move the IR builder into exit next, and start inserting just before the
  // original instruction.
  B.SetInsertPoint(&I);

  if (NeedResult) {
    // Create a PHI node to get our new atomic result into the exit block.
    PHINode *const PHI = B.CreatePHI(Ty, 2);
    PHI->addIncoming(UndefValue::get(Ty), EntryBB);
    PHI->addIncoming(NewI, SingleLaneTerminator->getParent());

    // We need to broadcast the value of the lowest active lane (the first
    // lane) to all other lanes in the wavefront. We use an intrinsic for this,
    // but have to handle 64-bit broadcasts with two calls to this intrinsic.
    Value *BroadcastI = nullptr;

    if (TyBitWidth == 64) {
      Value *const ExtractLo = B.CreateTrunc(PHI, B.getInt32Ty());
      Value *const ExtractHi =
          B.CreateTrunc(B.CreateLShr(PHI, 32), B.getInt32Ty());
      CallInst *const ReadFirstLaneLo =
          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo);
      CallInst *const ReadFirstLaneHi =
          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi);
      Value *const PartialInsert = B.CreateInsertElement(
          UndefValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
      Value *const Insert =
          B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
      BroadcastI = B.CreateBitCast(Insert, Ty);
    } else if (TyBitWidth == 32) {
      BroadcastI = B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI);
    } else {
      llvm_unreachable("Unhandled atomic bit width");
    }

    // Now that we have the result of our single atomic operation, we need to
    // get our individual lane's slice into the result. We use the lane offset
    // we previously calculated combined with the atomic result value we got
    // from the first lane, to get our lane's index into the atomic result.
    Value *LaneOffset = nullptr;
    if (ValDivergent) {
      LaneOffset =
          B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan);
    } else {
      switch (Op) {
      default:
        llvm_unreachable("Unhandled atomic op");
      case AtomicRMWInst::Add:
      case AtomicRMWInst::Sub:
        LaneOffset = buildMul(B, V, Mbcnt);
        break;
      case AtomicRMWInst::And:
      case AtomicRMWInst::Or:
      case AtomicRMWInst::Max:
      case AtomicRMWInst::Min:
      case AtomicRMWInst::UMax:
      case AtomicRMWInst::UMin:
        LaneOffset = B.CreateSelect(Cond, Identity, V);
        break;
      case AtomicRMWInst::Xor:
        LaneOffset = buildMul(B, V, B.CreateAnd(Mbcnt, 1));
        break;
      }
    }
    Value *const Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);

    if (IsPixelShader) {
      // Need a final PHI to reconverge to above the helper lane branch mask.
      B.SetInsertPoint(PixelExitBB->getFirstNonPHI());

      PHINode *const PHI = B.CreatePHI(Ty, 2);
      PHI->addIncoming(UndefValue::get(Ty), PixelEntryBB);
      PHI->addIncoming(Result, I.getParent());
      I.replaceAllUsesWith(PHI);
    } else {
      // Replace the original atomic instruction with the new one.
      I.replaceAllUsesWith(Result);
    }
  }

  // And delete the original.
  I.eraseFromParent();
}

INITIALIZE_PASS_BEGIN(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                      "AMDGPU atomic optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                    "AMDGPU atomic optimizations", false, false)

FunctionPass *llvm::createAMDGPUAtomicOptimizerPass() {
  return new AMDGPUAtomicOptimizer();
}