//===-- AMDGPUAtomicOptimizer.cpp -----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass optimizes atomic operations by using a single lane of a wavefront
/// to perform the atomic operation, thus reducing contention on that memory
/// location.
/// The atomic optimizer uses the following strategies to compute the scan and
/// reduced values:
/// 1. DPP -
///    This is the most efficient implementation for scan. DPP uses Whole Wave
///    Mode (WWM).
/// 2. Iterative -
///    An alternative implementation that iterates over all active lanes of the
///    wavefront using llvm.cttz and performs the scan using the readlane and
///    writelane intrinsics.
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/InitializePasses.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

#define DEBUG_TYPE "amdgpu-atomic-optimizer"

using namespace llvm;
using namespace llvm::AMDGPU;

namespace {

struct ReplacementInfo {
  Instruction *I;
  AtomicRMWInst::BinOp Op;
  unsigned ValIdx;
  bool ValDivergent;
};

class AMDGPUAtomicOptimizer : public FunctionPass {
public:
  static char ID;
  ScanOptions ScanImpl;
  AMDGPUAtomicOptimizer(ScanOptions ScanImpl)
      : FunctionPass(ID), ScanImpl(ScanImpl) {}

  bool runOnFunction(Function &F) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addRequired<UniformityInfoWrapperPass>();
    AU.addRequired<TargetPassConfig>();
  }
};

class AMDGPUAtomicOptimizerImpl
    : public InstVisitor<AMDGPUAtomicOptimizerImpl> {
private:
  SmallVector<ReplacementInfo, 8> ToReplace;
  const UniformityInfo *UA;
  const DataLayout *DL;
  DomTreeUpdater &DTU;
  const GCNSubtarget *ST;
  bool IsPixelShader;
  ScanOptions ScanImpl;

  Value *buildReduction(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                        Value *const Identity) const;
  Value *buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                   Value *const Identity) const;
  Value *buildShiftRight(IRBuilder<> &B, Value *V, Value *const Identity) const;

  std::pair<Value *, Value *>
  buildScanIteratively(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                       Value *const Identity, Value *V, Instruction &I,
                       BasicBlock *ComputeLoop, BasicBlock *ComputeEnd) const;

  void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx,
                      bool ValDivergent) const;

public:
  AMDGPUAtomicOptimizerImpl() = delete;

  AMDGPUAtomicOptimizerImpl(const UniformityInfo *UA, const DataLayout *DL,
                            DomTreeUpdater &DTU, const GCNSubtarget *ST,
                            bool IsPixelShader, ScanOptions ScanImpl)
      : UA(UA), DL(DL), DTU(DTU), ST(ST), IsPixelShader(IsPixelShader),
        ScanImpl(ScanImpl) {}

  bool run(Function &F);

  void visitAtomicRMWInst(AtomicRMWInst &I);
  void visitIntrinsicInst(IntrinsicInst &I);
};

} // namespace

char AMDGPUAtomicOptimizer::ID = 0;

char &llvm::AMDGPUAtomicOptimizerID = AMDGPUAtomicOptimizer::ID;

bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
  if (skipFunction(F)) {
    return false;
  }

  const UniformityInfo *UA =
      &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
  const DataLayout *DL = &F.getParent()->getDataLayout();

  DominatorTreeWrapperPass *const DTW =
      getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  DomTreeUpdater DTU(DTW ? &DTW->getDomTree() : nullptr,
                     DomTreeUpdater::UpdateStrategy::Lazy);

  const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  const GCNSubtarget *ST = &TM.getSubtarget<GCNSubtarget>(F);

  bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;

  return AMDGPUAtomicOptimizerImpl(UA, DL, DTU, ST, IsPixelShader, ScanImpl)
      .run(F);
}

PreservedAnalyses AMDGPUAtomicOptimizerPass::run(Function &F,
                                                 FunctionAnalysisManager &AM) {

  const auto *UA = &AM.getResult<UniformityInfoAnalysis>(F);
  const DataLayout *DL = &F.getParent()->getDataLayout();

  DomTreeUpdater DTU(&AM.getResult<DominatorTreeAnalysis>(F),
                     DomTreeUpdater::UpdateStrategy::Lazy);
  const GCNSubtarget *ST = &TM.getSubtarget<GCNSubtarget>(F);

  bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;

  bool IsChanged =
      AMDGPUAtomicOptimizerImpl(UA, DL, DTU, ST, IsPixelShader, ScanImpl)
          .run(F);

  if (!IsChanged) {
    return PreservedAnalyses::all();
  }

  PreservedAnalyses PA;
  PA.preserve<DominatorTreeAnalysis>();
  return PA;
}

bool AMDGPUAtomicOptimizerImpl::run(Function &F) {

  // Scan option None disables the pass.
  if (ScanImpl == ScanOptions::None) {
    return false;
  }

  visit(F);

  const bool Changed = !ToReplace.empty();

  for (ReplacementInfo &Info : ToReplace) {
    optimizeAtomic(*Info.I, Info.Op, Info.ValIdx, Info.ValDivergent);
  }

  ToReplace.clear();

  return Changed;
}

void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) {
  // Early exit for unhandled address space atomic instructions.
  switch (I.getPointerAddressSpace()) {
  default:
    return;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::LOCAL_ADDRESS:
    break;
  }

  AtomicRMWInst::BinOp Op = I.getOperation();

  switch (Op) {
  default:
    return;
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::And:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin:
    break;
  }

  const unsigned PtrIdx = 0;
  const unsigned ValIdx = 1;

  // If the pointer operand is divergent, then each lane is doing an atomic
  // operation on a different address, and we cannot optimize that.
  if (UA->isDivergentUse(I.getOperandUse(PtrIdx))) {
    return;
  }

  const bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget, and the atomic operation is 32
  // bits.
  if (ValDivergent &&
      (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
    return;
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}

void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) {
  AtomicRMWInst::BinOp Op;

  switch (I.getIntrinsicID()) {
  default:
    return;
  case Intrinsic::amdgcn_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
    Op = AtomicRMWInst::Add;
    break;
  case Intrinsic::amdgcn_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
    Op = AtomicRMWInst::Sub;
    break;
  case Intrinsic::amdgcn_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
    Op = AtomicRMWInst::And;
    break;
  case Intrinsic::amdgcn_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
    Op = AtomicRMWInst::Or;
    break;
  case Intrinsic::amdgcn_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
    Op = AtomicRMWInst::Xor;
    break;
  case Intrinsic::amdgcn_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
    Op = AtomicRMWInst::Min;
    break;
  case Intrinsic::amdgcn_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
    Op = AtomicRMWInst::UMin;
    break;
  case Intrinsic::amdgcn_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
    Op = AtomicRMWInst::Max;
    break;
  case Intrinsic::amdgcn_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
    Op = AtomicRMWInst::UMax;
    break;
  }

  const unsigned ValIdx = 0;

  const bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget, and the atomic operation is 32
  // bits.
  if (ValDivergent &&
      (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
    return;
  }

  // If any of the other arguments to the intrinsic are divergent, we can't
  // optimize the operation.
  for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) {
    if (UA->isDivergentUse(I.getOperandUse(Idx))) {
      return;
    }
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}

// Use the builder to create the non-atomic counterpart of the specified
// atomicrmw binary op.
static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                                  Value *LHS, Value *RHS) {
  CmpInst::Predicate Pred;

  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
    return B.CreateBinOp(Instruction::Add, LHS, RHS);
  case AtomicRMWInst::Sub:
    return B.CreateBinOp(Instruction::Sub, LHS, RHS);
  case AtomicRMWInst::And:
    return B.CreateBinOp(Instruction::And, LHS, RHS);
  case AtomicRMWInst::Or:
    return B.CreateBinOp(Instruction::Or, LHS, RHS);
  case AtomicRMWInst::Xor:
    return B.CreateBinOp(Instruction::Xor, LHS, RHS);

  case AtomicRMWInst::Max:
    Pred = CmpInst::ICMP_SGT;
    break;
  case AtomicRMWInst::Min:
    Pred = CmpInst::ICMP_SLT;
    break;
  case AtomicRMWInst::UMax:
    Pred = CmpInst::ICMP_UGT;
    break;
  case AtomicRMWInst::UMin:
    Pred = CmpInst::ICMP_ULT;
    break;
  }
  Value *Cond = B.CreateICmp(Pred, LHS, RHS);
  return B.CreateSelect(Cond, LHS, RHS);
}

// Use the builder to create a reduction of V across the wavefront, with all
// lanes active, returning the same result in all lanes.
Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
                                                 AtomicRMWInst::BinOp Op,
                                                 Value *V,
                                                 Value *const Identity) const {
  Type *const Ty = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);

  // Reduce within each row of 16 lanes.
  for (unsigned Idx = 0; Idx < 4; Idx++) {
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_XMASK0 | 1 << Idx),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
  }

  // Reduce within each pair of rows (i.e. 32 lanes).
  assert(ST->hasPermLaneX16());
  V = buildNonAtomicBinOp(
      B, Op, V,
      B.CreateIntrinsic(
          Intrinsic::amdgcn_permlanex16, {},
          {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()}));

  if (ST->isWave32())
    return V;

  if (ST->hasPermLane64()) {
    // Reduce across the upper and lower 32 lanes.
    return buildNonAtomicBinOp(
        B, Op, V, B.CreateIntrinsic(Intrinsic::amdgcn_permlane64, {}, V));
  }

  // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
  // combine them with a scalar operation.
  Function *ReadLane =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
  Value *const Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
  Value *const Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
  return buildNonAtomicBinOp(B, Op, Lane0, Lane32);
}

// Use the builder to create an inclusive scan of V across the wavefront, with
// all lanes active.
Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
                                            AtomicRMWInst::BinOp Op, Value *V,
                                            Value *const Identity) const {
  Type *const Ty = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);

  for (unsigned Idx = 0; Idx < 4; Idx++) {
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 | 1 << Idx),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
  }
  if (ST->hasDPPBroadcasts()) {
    // GFX9 has DPP row broadcast operations.
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST15), B.getInt32(0xa),
                      B.getInt32(0xf), B.getFalse()}));
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST31), B.getInt32(0xc),
                      B.getInt32(0xf), B.getFalse()}));
  } else {
    // On GFX10 all DPP operations are confined to a single row. To get cross-
    // row operations we have to use permlane or readlane.

    // Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes
    // 48..63).
    assert(ST->hasPermLaneX16());
    Value *const PermX = B.CreateIntrinsic(
        Intrinsic::amdgcn_permlanex16, {},
        {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, PermX, B.getInt32(DPP::QUAD_PERM_ID),
                      B.getInt32(0xa), B.getInt32(0xf), B.getFalse()}));
    if (!ST->isWave32()) {
      // Combine lane 31 into lanes 32..63.
      Value *const Lane31 = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
                                              {V, B.getInt32(31)});
      V = buildNonAtomicBinOp(
          B, Op, V,
          B.CreateCall(UpdateDPP,
                       {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID),
                        B.getInt32(0xc), B.getInt32(0xf), B.getFalse()}));
    }
  }
  return V;
}

// Use the builder to create a shift right of V across the wavefront, with all
// lanes active, to turn an inclusive scan into an exclusive scan.
Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
                                                  Value *const Identity) const {
  Type *const Ty = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);

  if (ST->hasDPPWavefrontShifts()) {
    // GFX9 has DPP wavefront shift operations.
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
                      B.getInt32(0xf), B.getFalse()});
  } else {
    Function *ReadLane =
        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
    Function *WriteLane =
        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {});

    // On GFX10 all DPP operations are confined to a single row. To get cross-
    // row operations we have to use permlane or readlane.
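    // Shift right by one lane within each row using DPP, then patch the row
    // boundaries below by copying the old last lane of each row into the
    // first lane of the next row with readlane/writelane.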
    Value *Old = V;
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 + 1),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});

    // Copy the old lane 15 to the new lane 16.
    V = B.CreateCall(WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(15)}),
                                 B.getInt32(16), V});

    if (!ST->isWave32()) {
      // Copy the old lane 31 to the new lane 32.
      V = B.CreateCall(
          WriteLane,
          {B.CreateCall(ReadLane, {Old, B.getInt32(31)}), B.getInt32(32), V});

      // Copy the old lane 47 to the new lane 48.
      V = B.CreateCall(
          WriteLane,
          {B.CreateCall(ReadLane, {Old, B.getInt32(47)}), B.getInt32(48), V});
    }
  }

  return V;
}

// Use the builder to create an exclusive scan and compute the final reduced
// value using an iterative approach. This provides an alternative to the DPP
// implementation, which uses WWM for its scan computations. This API iterates
// over the active lanes to read, compute and update the value using the
// readlane and writelane intrinsics.
std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildScanIteratively(
    IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *const Identity, Value *V,
    Instruction &I, BasicBlock *ComputeLoop, BasicBlock *ComputeEnd) const {

  auto *Ty = I.getType();
  auto *WaveTy = B.getIntNTy(ST->getWavefrontSize());
  auto *EntryBB = I.getParent();
  auto NeedResult = !I.use_empty();

  auto *Ballot =
      B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());

  // Start inserting instructions for the ComputeLoop block.
  B.SetInsertPoint(ComputeLoop);
  // Phi nodes for the accumulator, the scan result destination, and the
  // remaining active lanes.
  auto *Accumulator = B.CreatePHI(Ty, 2, "Accumulator");
  Accumulator->addIncoming(Identity, EntryBB);
  PHINode *OldValuePhi = nullptr;
  if (NeedResult) {
    OldValuePhi = B.CreatePHI(Ty, 2, "OldValuePhi");
    OldValuePhi->addIncoming(PoisonValue::get(Ty), EntryBB);
  }
  auto *ActiveBits = B.CreatePHI(WaveTy, 2, "ActiveBits");
  ActiveBits->addIncoming(Ballot, EntryBB);

  // Use the llvm.cttz intrinsic to find the lowest remaining active lane.
  auto *FF1 =
      B.CreateIntrinsic(Intrinsic::cttz, WaveTy, {ActiveBits, B.getTrue()});
  auto *LaneIdxInt = B.CreateTrunc(FF1, Ty);

  // Get the value required for the atomic operation.
  auto *LaneValue =
      B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {}, {V, LaneIdxInt});

  // Perform a writelane if the intermediate scan results are required later in
  // the kernel computations.
  Value *OldValue = nullptr;
  if (NeedResult) {
    OldValue = B.CreateIntrinsic(Intrinsic::amdgcn_writelane, {},
                                 {Accumulator, LaneIdxInt, OldValuePhi});
    OldValuePhi->addIncoming(OldValue, ComputeLoop);
  }

  // Accumulate the results.
  auto *NewAccumulator = buildNonAtomicBinOp(B, Op, Accumulator, LaneValue);
  Accumulator->addIncoming(NewAccumulator, ComputeLoop);

  // Clear the bit of the current active lane so that llvm.cttz returns the
  // next active lane on the next iteration.
  auto *Mask = B.CreateShl(ConstantInt::get(WaveTy, 1), FF1);

  auto *InverseMask = B.CreateXor(Mask, ConstantInt::get(WaveTy, -1));
  auto *NewActiveBits = B.CreateAnd(ActiveBits, InverseMask);
  ActiveBits->addIncoming(NewActiveBits, ComputeLoop);

  // Branch out of the loop when all lanes are processed.
  auto *IsEnd = B.CreateICmpEQ(NewActiveBits, ConstantInt::get(WaveTy, 0));
  B.CreateCondBr(IsEnd, ComputeEnd, ComputeLoop);

  B.SetInsertPoint(ComputeEnd);

  return {OldValue, NewAccumulator};
}

// Get the identity value for the given atomic op, i.e. the neutral element
// used to initialize the scan and to fill inactive lanes.
static APInt getIdentityValueForAtomicOp(AtomicRMWInst::BinOp Op,
                                         unsigned BitWidth) {
  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::UMax:
    return APInt::getMinValue(BitWidth);
  case AtomicRMWInst::And:
  case AtomicRMWInst::UMin:
    return APInt::getMaxValue(BitWidth);
  case AtomicRMWInst::Max:
    return APInt::getSignedMinValue(BitWidth);
  case AtomicRMWInst::Min:
    return APInt::getSignedMaxValue(BitWidth);
  }
}

// Build a multiply, folding away the trivial case where the LHS is the
// constant 1.
static Value *buildMul(IRBuilder<> &B, Value *LHS, Value *RHS) {
  const ConstantInt *CI = dyn_cast<ConstantInt>(LHS);
  return (CI && CI->isOne()) ? RHS : B.CreateMul(LHS, RHS);
}

void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
                                               AtomicRMWInst::BinOp Op,
                                               unsigned ValIdx,
                                               bool ValDivergent) const {
  // Start building just before the instruction.
  IRBuilder<> B(&I);

  // If we are in a pixel shader, because of how we have to mask out helper
  // lane invocations, we need to record the entry and exit BB's.
  BasicBlock *PixelEntryBB = nullptr;
  BasicBlock *PixelExitBB = nullptr;

  // If we're optimizing an atomic within a pixel shader, we need to wrap the
  // entire atomic operation in a helper-lane check. We do not want any helper
  // lanes that are around only for the purposes of derivatives to take part
  // in any cross-lane communication, and we use a branch on whether the lane is
  // live to do this.
  if (IsPixelShader) {
    // Record I's original position as the entry block.
    PixelEntryBB = I.getParent();

    Value *const Cond = B.CreateIntrinsic(Intrinsic::amdgcn_ps_live, {}, {});
    Instruction *const NonHelperTerminator =
        SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, &DTU, nullptr);

    // Record I's new position as the exit block.
    PixelExitBB = I.getParent();

    I.moveBefore(NonHelperTerminator);
    B.SetInsertPoint(&I);
  }

  Type *const Ty = I.getType();
  const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);
  auto *const VecTy = FixedVectorType::get(B.getInt32Ty(), 2);

  // This is the value in the atomic operation we need to combine in order to
  // reduce the number of atomic operations.
  Value *const V = I.getOperand(ValIdx);

  // We need to know how many lanes are active within the wavefront, and we do
  // this by doing a ballot of active lanes.
  Type *const WaveTy = B.getIntNTy(ST->getWavefrontSize());
  CallInst *const Ballot =
      B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());

  // We need to know how many lanes are active within the wavefront that are
  // below us. If we counted each lane linearly starting from 0, a lane is
  // below us only if its associated index was less than ours. We do this by
  // using the mbcnt intrinsic.
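  // On wave64 the 64-bit ballot is split into two 32-bit halves, and the
  // count is accumulated with mbcnt_lo on the low half followed by mbcnt_hi
  // on the high half.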
  Value *Mbcnt;
  if (ST->isWave32()) {
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                              {Ballot, B.getInt32(0)});
  } else {
    Value *const BitCast = B.CreateBitCast(Ballot, VecTy);
    Value *const ExtractLo = B.CreateExtractElement(BitCast, B.getInt32(0));
    Value *const ExtractHi = B.CreateExtractElement(BitCast, B.getInt32(1));
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                              {ExtractLo, B.getInt32(0)});
    Mbcnt =
        B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {ExtractHi, Mbcnt});
  }
  Mbcnt = B.CreateIntCast(Mbcnt, Ty, false);

  Value *const Identity = B.getInt(getIdentityValueForAtomicOp(Op, TyBitWidth));

  Value *ExclScan = nullptr;
  Value *NewV = nullptr;

  const bool NeedResult = !I.use_empty();

  Function *F = I.getFunction();
  LLVMContext &C = F->getContext();
  BasicBlock *ComputeLoop = nullptr;
  BasicBlock *ComputeEnd = nullptr;
  // If we have a divergent value in each lane, we need to combine the value
  // using DPP.
  if (ValDivergent) {
    const AtomicRMWInst::BinOp ScanOp =
        Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op;
    if (ScanImpl == ScanOptions::DPP) {
      // First we need to set all inactive invocations to the identity value, so
      // that they can correctly contribute to the final result.
      NewV =
          B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
      if (!NeedResult && ST->hasPermLaneX16()) {
        // On GFX10 the permlanex16 instruction helps us build a reduction
        // without too many readlanes and writelanes, which are generally bad
        // for performance.
        NewV = buildReduction(B, ScanOp, NewV, Identity);
      } else {
        NewV = buildScan(B, ScanOp, NewV, Identity);
        if (NeedResult)
          ExclScan = buildShiftRight(B, NewV, Identity);
        // Read the value from the last lane, which has accumulated the values
        // of each active lane in the wavefront. This will be our new value
        // which we will provide to the atomic operation.
        Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
        assert(TyBitWidth == 32);
        NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
                                 {NewV, LastLaneIdx});
      }
      // Finally mark the readlanes in the WWM section.
      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
    } else if (ScanImpl == ScanOptions::Iterative) {
      // Alternative implementation for the scan.
      ComputeLoop = BasicBlock::Create(C, "ComputeLoop", F);
      ComputeEnd = BasicBlock::Create(C, "ComputeEnd", F);
      std::tie(ExclScan, NewV) = buildScanIteratively(B, ScanOp, Identity, V, I,
                                                      ComputeLoop, ComputeEnd);
    } else {
      llvm_unreachable("Atomic Optimizer is disabled for the None strategy");
    }
  } else {
    switch (Op) {
    default:
      llvm_unreachable("Unhandled atomic op");

    case AtomicRMWInst::Add:
    case AtomicRMWInst::Sub: {
      // The new value we will be contributing to the atomic operation is the
      // old value times the number of active lanes.
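      // For example, with 16 active lanes each contributing the same value V,
      // the single atomic add (or sub) below applies 16 * V at once.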
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = buildMul(B, V, Ctpop);
      break;
    }

    case AtomicRMWInst::And:
    case AtomicRMWInst::Or:
    case AtomicRMWInst::Max:
    case AtomicRMWInst::Min:
    case AtomicRMWInst::UMax:
    case AtomicRMWInst::UMin:
      // These operations with a uniform value are idempotent: doing the atomic
      // operation multiple times has the same effect as doing it once.
      NewV = V;
      break;

    case AtomicRMWInst::Xor:
      // The new value we will be contributing to the atomic operation is the
      // old value times the parity of the number of active lanes.
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = buildMul(B, V, B.CreateAnd(Ctpop, 1));
      break;
    }
  }

  // We only want a single lane to enter our new control flow, and we do this
  // by checking if there are any active lanes below us. Only one lane will
  // have 0 active lanes below us, so that will be the only one to progress.
  Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getIntN(TyBitWidth, 0));

  // Store I's original basic block before we split the block.
  BasicBlock *const EntryBB = I.getParent();

  // We need to introduce some new control flow to force a single lane to be
  // active. We do this by splitting I's basic block at I, and introducing the
  // new block such that:
  // entry --> single_lane -\
  //       \------------------> exit
  Instruction *const SingleLaneTerminator =
      SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, &DTU, nullptr);

  // At this point, we have split I's block to allow one lane in the wavefront
  // to update the precomputed reduced value. We have also completed the
  // codegen for the new control flow, i.e. the iterative loop which performs
  // the reduction and scan using the ComputeLoop and ComputeEnd blocks.
  // For the new control flow, we need to move the branch instruction, i.e. the
  // terminator created during SplitBlockAndInsertIfThen, from I's block to the
  // ComputeEnd block. We also need to set up the predecessor of the next block
  // once the single lane is done updating the final reduced value.
  BasicBlock *Predecessor = nullptr;
  if (ValDivergent && ScanImpl == ScanOptions::Iterative) {
    // Move the terminator from I's block to the ComputeEnd block.
    Instruction *Terminator = EntryBB->getTerminator();
    B.SetInsertPoint(ComputeEnd);
    Terminator->removeFromParent();
    B.Insert(Terminator);

    // Branch to the ComputeLoop block unconditionally from I's block for the
    // iterative approach.
    B.SetInsertPoint(EntryBB);
    B.CreateBr(ComputeLoop);

    // Update the dominator tree for the new control flow.
    DTU.applyUpdates(
        {{DominatorTree::Insert, EntryBB, ComputeLoop},
         {DominatorTree::Insert, ComputeLoop, ComputeEnd},
         {DominatorTree::Delete, EntryBB, SingleLaneTerminator->getParent()}});

    Predecessor = ComputeEnd;
  } else {
    Predecessor = EntryBB;
  }
  // Move the IR builder into single_lane next.
  B.SetInsertPoint(SingleLaneTerminator);

  // Clone the original atomic operation into single lane, replacing the
  // original value with our newly created one.
  Instruction *const NewI = I.clone();
  B.Insert(NewI);
  NewI->setOperand(ValIdx, NewV);

  // Move the IR builder into exit next, and start inserting just before the
  // original instruction.
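  // From here on, the result of the single atomic (broadcast from the first
  // active lane) is combined with each lane's own offset to recover the value
  // this lane would have received from its original atomic.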
  B.SetInsertPoint(&I);

  if (NeedResult) {
    // Create a PHI node to get our new atomic result into the exit block.
    PHINode *const PHI = B.CreatePHI(Ty, 2);
    PHI->addIncoming(PoisonValue::get(Ty), Predecessor);
    PHI->addIncoming(NewI, SingleLaneTerminator->getParent());

    // We need to broadcast the value of the lowest active lane (the first
    // lane) to all other lanes in the wavefront. We use an intrinsic for this,
    // but have to handle 64-bit broadcasts with two calls to this intrinsic.
    Value *BroadcastI = nullptr;

    if (TyBitWidth == 64) {
      Value *const ExtractLo = B.CreateTrunc(PHI, B.getInt32Ty());
      Value *const ExtractHi =
          B.CreateTrunc(B.CreateLShr(PHI, 32), B.getInt32Ty());
      CallInst *const ReadFirstLaneLo =
          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo);
      CallInst *const ReadFirstLaneHi =
          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi);
      Value *const PartialInsert = B.CreateInsertElement(
          PoisonValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
      Value *const Insert =
          B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
      BroadcastI = B.CreateBitCast(Insert, Ty);
    } else if (TyBitWidth == 32) {
      BroadcastI = B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI);
    } else {
      llvm_unreachable("Unhandled atomic bit width");
    }

    // Now that we have the result of our single atomic operation, we need to
    // get our individual lane's slice into the result. We use the lane offset
    // we previously calculated combined with the atomic result value we got
    // from the first lane, to get our lane's index into the atomic result.
    Value *LaneOffset = nullptr;
    if (ValDivergent) {
      if (ScanImpl == ScanOptions::DPP) {
        LaneOffset =
            B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan);
      } else if (ScanImpl == ScanOptions::Iterative) {
        LaneOffset = ExclScan;
      } else {
        llvm_unreachable("Atomic Optimizer is disabled for the None strategy");
      }
    } else {
      switch (Op) {
      default:
        llvm_unreachable("Unhandled atomic op");
      case AtomicRMWInst::Add:
      case AtomicRMWInst::Sub:
        LaneOffset = buildMul(B, V, Mbcnt);
        break;
      case AtomicRMWInst::And:
      case AtomicRMWInst::Or:
      case AtomicRMWInst::Max:
      case AtomicRMWInst::Min:
      case AtomicRMWInst::UMax:
      case AtomicRMWInst::UMin:
        LaneOffset = B.CreateSelect(Cond, Identity, V);
        break;
      case AtomicRMWInst::Xor:
        LaneOffset = buildMul(B, V, B.CreateAnd(Mbcnt, 1));
        break;
      }
    }
    Value *const Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);

    if (IsPixelShader) {
      // Need a final PHI to reconverge to above the helper lane branch mask.
      B.SetInsertPoint(PixelExitBB->getFirstNonPHI());

      PHINode *const PHI = B.CreatePHI(Ty, 2);
      PHI->addIncoming(PoisonValue::get(Ty), PixelEntryBB);
      PHI->addIncoming(Result, I.getParent());
      I.replaceAllUsesWith(PHI);
    } else {
      // Replace the original atomic instruction with the new one.
      I.replaceAllUsesWith(Result);
    }
  }

  // And delete the original.
  I.eraseFromParent();
}

INITIALIZE_PASS_BEGIN(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                      "AMDGPU atomic optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                    "AMDGPU atomic optimizations", false, false)

FunctionPass *llvm::createAMDGPUAtomicOptimizerPass(ScanOptions ScanStrategy) {
  return new AMDGPUAtomicOptimizer(ScanStrategy);
}