//===-- AMDGPUAtomicOptimizer.cpp -----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass optimizes atomic operations by using a single lane of a wavefront
/// to perform the atomic operation, thus reducing contention on that memory
/// location.
/// The atomic optimizer uses the following strategies to compute the scan and
/// reduced values:
/// 1. DPP -
///    This is the most efficient implementation for the scan. DPP uses Whole
///    Wave Mode (WWM).
/// 2. Iterative -
///    An alternative implementation that iterates over all active lanes of the
///    wavefront using llvm.cttz and performs the scan using the readlane and
///    writelane intrinsics.
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/InitializePasses.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

#define DEBUG_TYPE "amdgpu-atomic-optimizer"

using namespace llvm;
using namespace llvm::AMDGPU;

namespace {

struct ReplacementInfo {
  Instruction *I;
  AtomicRMWInst::BinOp Op;
  unsigned ValIdx;
  bool ValDivergent;
};

class AMDGPUAtomicOptimizer : public FunctionPass {
public:
  static char ID;
  ScanOptions ScanImpl;
  AMDGPUAtomicOptimizer(ScanOptions ScanImpl)
      : FunctionPass(ID), ScanImpl(ScanImpl) {}

  bool runOnFunction(Function &F) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addRequired<UniformityInfoWrapperPass>();
    AU.addRequired<TargetPassConfig>();
  }
};

class AMDGPUAtomicOptimizerImpl
    : public InstVisitor<AMDGPUAtomicOptimizerImpl> {
private:
  Function &F;
  SmallVector<ReplacementInfo, 8> ToReplace;
  const UniformityInfo &UA;
  const DataLayout &DL;
  DomTreeUpdater &DTU;
  const GCNSubtarget &ST;
  bool IsPixelShader;
  ScanOptions ScanImpl;

  Value *buildReduction(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                        Value *const Identity) const;
  Value *buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                   Value *const Identity) const;
  Value *buildShiftRight(IRBuilder<> &B, Value *V,
                         Value *const Identity) const;

  std::pair<Value *, Value *>
  buildScanIteratively(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                       Value *const Identity, Value *V, Instruction &I,
                       BasicBlock *ComputeLoop, BasicBlock *ComputeEnd) const;

  void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx,
                      bool ValDivergent) const;

public:
  AMDGPUAtomicOptimizerImpl() = delete;

  AMDGPUAtomicOptimizerImpl(Function &F, const UniformityInfo &UA,
                            DomTreeUpdater &DTU, const GCNSubtarget &ST,
                            ScanOptions ScanImpl)
      : F(F), UA(UA), DL(F.getDataLayout()), DTU(DTU), ST(ST),
        IsPixelShader(F.getCallingConv() == CallingConv::AMDGPU_PS),
        ScanImpl(ScanImpl) {}

  bool run();

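  // InstVisitor callbacks. These only collect candidate instructions into
  // ToReplace; the actual rewriting happens afterwards, when run() calls
  // optimizeAtomic() on each recorded candidate.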
  void visitAtomicRMWInst(AtomicRMWInst &I);
  void visitIntrinsicInst(IntrinsicInst &I);
};

} // namespace

char AMDGPUAtomicOptimizer::ID = 0;

char &llvm::AMDGPUAtomicOptimizerID = AMDGPUAtomicOptimizer::ID;

bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
  if (skipFunction(F)) {
    return false;
  }

  const UniformityInfo &UA =
      getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();

  DominatorTreeWrapperPass *DTW =
      getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  DomTreeUpdater DTU(DTW ? &DTW->getDomTree() : nullptr,
                     DomTreeUpdater::UpdateStrategy::Lazy);

  const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);

  return AMDGPUAtomicOptimizerImpl(F, UA, DTU, ST, ScanImpl).run();
}

PreservedAnalyses AMDGPUAtomicOptimizerPass::run(Function &F,
                                                 FunctionAnalysisManager &AM) {
  const auto &UA = AM.getResult<UniformityInfoAnalysis>(F);

  DomTreeUpdater DTU(&AM.getResult<DominatorTreeAnalysis>(F),
                     DomTreeUpdater::UpdateStrategy::Lazy);
  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);

  bool IsChanged = AMDGPUAtomicOptimizerImpl(F, UA, DTU, ST, ScanImpl).run();

  if (!IsChanged) {
    return PreservedAnalyses::all();
  }

  PreservedAnalyses PA;
  PA.preserve<DominatorTreeAnalysis>();
  return PA;
}

bool AMDGPUAtomicOptimizerImpl::run() {
  // Scan option None disables the pass.
  if (ScanImpl == ScanOptions::None)
    return false;

  visit(F);
  if (ToReplace.empty())
    return false;

  for (auto &[I, Op, ValIdx, ValDivergent] : ToReplace)
    optimizeAtomic(*I, Op, ValIdx, ValDivergent);
  ToReplace.clear();
  return true;
}

static bool isLegalCrossLaneType(Type *Ty) {
  switch (Ty->getTypeID()) {
  case Type::FloatTyID:
  case Type::DoubleTyID:
    return true;
  case Type::IntegerTyID: {
    unsigned Size = Ty->getIntegerBitWidth();
    return (Size == 32 || Size == 64);
  }
  default:
    return false;
  }
}

void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) {
  // Early exit for unhandled address space atomic instructions.
  switch (I.getPointerAddressSpace()) {
  default:
    return;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::LOCAL_ADDRESS:
    break;
  }

  AtomicRMWInst::BinOp Op = I.getOperation();

  switch (Op) {
  default:
    return;
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::And:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin:
  case AtomicRMWInst::FAdd:
  case AtomicRMWInst::FSub:
  case AtomicRMWInst::FMax:
  case AtomicRMWInst::FMin:
    break;
  }

  // Only 32 and 64 bit floating point atomic ops are supported.
  if (AtomicRMWInst::isFPOperation(Op) &&
      !(I.getType()->isFloatTy() || I.getType()->isDoubleTy())) {
    return;
  }

  const unsigned PtrIdx = 0;
  const unsigned ValIdx = 1;

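  // A typical candidate is a wavefront in which every lane executes, e.g.,
  // "atomicrmw add ptr addrspace(1) %p, i32 %v" on the same uniform address
  // %p (a sketch); the checks below reject the cases the pass cannot handle.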
  // If the pointer operand is divergent, then each lane is doing an atomic
  // operation on a different address, and we cannot optimize that.
  if (UA.isDivergentUse(I.getOperandUse(PtrIdx))) {
    return;
  }

  bool ValDivergent = UA.isDivergentUse(I.getOperandUse(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget (for DPP strategy), and the atomic
  // operation is 32 or 64 bits.
  if (ValDivergent) {
    if (ScanImpl == ScanOptions::DPP && !ST.hasDPP())
      return;

    if (!isLegalCrossLaneType(I.getType()))
      return;
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  ToReplace.push_back({&I, Op, ValIdx, ValDivergent});
}

void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) {
  AtomicRMWInst::BinOp Op;

  switch (I.getIntrinsicID()) {
  default:
    return;
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
    Op = AtomicRMWInst::Add;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
    Op = AtomicRMWInst::Sub;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
    Op = AtomicRMWInst::And;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
    Op = AtomicRMWInst::Or;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
    Op = AtomicRMWInst::Xor;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
    Op = AtomicRMWInst::Min;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
    Op = AtomicRMWInst::UMin;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
    Op = AtomicRMWInst::Max;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
    Op = AtomicRMWInst::UMax;
    break;
  }

  const unsigned ValIdx = 0;

  const bool ValDivergent = UA.isDivergentUse(I.getOperandUse(ValIdx));

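  // For these buffer atomics the value to combine is operand 0, e.g. (sketch)
  // llvm.amdgcn.raw.buffer.atomic.add(%vdata, %rsrc, %voffset, %soffset,
  // %cachepolicy); the remaining operands describe the resource and offsets
  // and must be uniform for the optimization to apply (checked below).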
  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget (for DPP strategy), and the atomic
  // operation is 32 or 64 bits.
  if (ValDivergent) {
    if (ScanImpl == ScanOptions::DPP && !ST.hasDPP())
      return;

    if (!isLegalCrossLaneType(I.getType()))
      return;
  }

  // If any of the other arguments to the intrinsic are divergent, we can't
  // optimize the operation.
  for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) {
    if (UA.isDivergentUse(I.getOperandUse(Idx)))
      return;
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  ToReplace.push_back({&I, Op, ValIdx, ValDivergent});
}

// Use the builder to create the non-atomic counterpart of the specified
// atomicrmw binary op.
static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                                  Value *LHS, Value *RHS) {
  CmpInst::Predicate Pred;

  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
    return B.CreateBinOp(Instruction::Add, LHS, RHS);
  case AtomicRMWInst::FAdd:
    return B.CreateFAdd(LHS, RHS);
  case AtomicRMWInst::Sub:
    return B.CreateBinOp(Instruction::Sub, LHS, RHS);
  case AtomicRMWInst::FSub:
    return B.CreateFSub(LHS, RHS);
  case AtomicRMWInst::And:
    return B.CreateBinOp(Instruction::And, LHS, RHS);
  case AtomicRMWInst::Or:
    return B.CreateBinOp(Instruction::Or, LHS, RHS);
  case AtomicRMWInst::Xor:
    return B.CreateBinOp(Instruction::Xor, LHS, RHS);

  case AtomicRMWInst::Max:
    Pred = CmpInst::ICMP_SGT;
    break;
  case AtomicRMWInst::Min:
    Pred = CmpInst::ICMP_SLT;
    break;
  case AtomicRMWInst::UMax:
    Pred = CmpInst::ICMP_UGT;
    break;
  case AtomicRMWInst::UMin:
    Pred = CmpInst::ICMP_ULT;
    break;
  case AtomicRMWInst::FMax:
    return B.CreateMaxNum(LHS, RHS);
  case AtomicRMWInst::FMin:
    return B.CreateMinNum(LHS, RHS);
  }
  Value *Cond = B.CreateICmp(Pred, LHS, RHS);
  return B.CreateSelect(Cond, LHS, RHS);
}

// Use the builder to create a reduction of V across the wavefront, with all
// lanes active, returning the same result in all lanes.
Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
                                                 AtomicRMWInst::BinOp Op,
                                                 Value *V,
                                                 Value *const Identity) const {
  Type *AtomicTy = V->getType();
  Module *M = B.GetInsertBlock()->getModule();

  // Reduce within each row of 16 lanes.
  for (unsigned Idx = 0; Idx < 4; Idx++) {
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateIntrinsic(Intrinsic::amdgcn_update_dpp, AtomicTy,
                          {Identity, V, B.getInt32(DPP::ROW_XMASK0 | 1 << Idx),
                           B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
  }

  // Reduce within each pair of rows (i.e. 32 lanes).
  assert(ST.hasPermLaneX16());
  Value *Permlanex16Call =
      B.CreateIntrinsic(AtomicTy, Intrinsic::amdgcn_permlanex16,
                        {PoisonValue::get(AtomicTy), V, B.getInt32(0),
                         B.getInt32(0), B.getFalse(), B.getFalse()});
  V = buildNonAtomicBinOp(B, Op, V, Permlanex16Call);
  if (ST.isWave32()) {
    return V;
  }

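  // At this point each half of a wave64 holds the reduction of its own 32
  // lanes; what remains is to combine the two halves, either with permlane64
  // where available or with two readlanes and one more scalar op.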
  if (ST.hasPermLane64()) {
    // Reduce across the upper and lower 32 lanes.
    Value *Permlane64Call =
        B.CreateIntrinsic(AtomicTy, Intrinsic::amdgcn_permlane64, V);
    return buildNonAtomicBinOp(B, Op, V, Permlane64Call);
  }

  // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
  // combine them with a scalar operation.
  Function *ReadLane = Intrinsic::getOrInsertDeclaration(
      M, Intrinsic::amdgcn_readlane, AtomicTy);
  Value *Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
  Value *Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
  return buildNonAtomicBinOp(B, Op, Lane0, Lane32);
}

// Use the builder to create an inclusive scan of V across the wavefront, with
// all lanes active.
Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
                                            AtomicRMWInst::BinOp Op, Value *V,
                                            Value *Identity) const {
  Type *AtomicTy = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP = Intrinsic::getOrInsertDeclaration(
      M, Intrinsic::amdgcn_update_dpp, AtomicTy);

  for (unsigned Idx = 0; Idx < 4; Idx++) {
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 | 1 << Idx),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
  }
  if (ST.hasDPPBroadcasts()) {
    // GFX9 has DPP row broadcast operations.
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST15), B.getInt32(0xa),
                      B.getInt32(0xf), B.getFalse()}));
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST31), B.getInt32(0xc),
                      B.getInt32(0xf), B.getFalse()}));
  } else {
    // On GFX10 all DPP operations are confined to a single row. To get cross-
    // row operations we have to use permlane or readlane.

    // Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes
    // 48..63).
    assert(ST.hasPermLaneX16());
    Value *PermX =
        B.CreateIntrinsic(AtomicTy, Intrinsic::amdgcn_permlanex16,
                          {PoisonValue::get(AtomicTy), V, B.getInt32(-1),
                           B.getInt32(-1), B.getFalse(), B.getFalse()});

    Value *UpdateDPPCall = B.CreateCall(
        UpdateDPP, {Identity, PermX, B.getInt32(DPP::QUAD_PERM_ID),
                    B.getInt32(0xa), B.getInt32(0xf), B.getFalse()});
    V = buildNonAtomicBinOp(B, Op, V, UpdateDPPCall);

    if (!ST.isWave32()) {
      // Combine lane 31 into lanes 32..63.
      Value *const Lane31 = B.CreateIntrinsic(
          AtomicTy, Intrinsic::amdgcn_readlane, {V, B.getInt32(31)});

      Value *UpdateDPPCall = B.CreateCall(
          UpdateDPP, {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID),
                      B.getInt32(0xc), B.getInt32(0xf), B.getFalse()});

      V = buildNonAtomicBinOp(B, Op, V, UpdateDPPCall);
    }
  }
  return V;
}

// Use the builder to create a shift right of V across the wavefront, with all
// lanes active, to turn an inclusive scan into an exclusive scan.
Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
                                                  Value *Identity) const {
  Type *AtomicTy = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP = Intrinsic::getOrInsertDeclaration(
      M, Intrinsic::amdgcn_update_dpp, AtomicTy);
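  // Both branches below shift every lane's value up by one lane and feed the
  // identity into lane 0, turning the inclusive scan produced by buildScan()
  // into an exclusive scan: afterwards lane i holds the combined value of
  // lanes 0..i-1.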
  if (ST.hasDPPWavefrontShifts()) {
    // GFX9 has DPP wavefront shift operations.
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
                      B.getInt32(0xf), B.getFalse()});
  } else {
    Function *ReadLane = Intrinsic::getOrInsertDeclaration(
        M, Intrinsic::amdgcn_readlane, AtomicTy);
    Function *WriteLane = Intrinsic::getOrInsertDeclaration(
        M, Intrinsic::amdgcn_writelane, AtomicTy);

    // On GFX10 all DPP operations are confined to a single row. To get cross-
    // row operations we have to use permlane or readlane.
    Value *Old = V;
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 + 1),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});

    // Copy the old lane 15 to the new lane 16.
    V = B.CreateCall(WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(15)}),
                                 B.getInt32(16), V});

    if (!ST.isWave32()) {
      // Copy the old lane 31 to the new lane 32.
      V = B.CreateCall(
          WriteLane,
          {B.CreateCall(ReadLane, {Old, B.getInt32(31)}), B.getInt32(32), V});

      // Copy the old lane 47 to the new lane 48.
      V = B.CreateCall(
          WriteLane,
          {B.CreateCall(ReadLane, {Old, B.getInt32(47)}), B.getInt32(48), V});
    }
  }

  return V;
}

// Use the builder to create an exclusive scan and compute the final reduced
// value using an iterative approach. This provides an alternative to the DPP
// implementation, which relies on WWM for its scan computation. It iterates
// over the active lanes of the wavefront, reading, combining and updating the
// value using the readlane and writelane intrinsics.
std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildScanIteratively(
    IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *const Identity, Value *V,
    Instruction &I, BasicBlock *ComputeLoop, BasicBlock *ComputeEnd) const {
  auto *Ty = I.getType();
  auto *WaveTy = B.getIntNTy(ST.getWavefrontSize());
  auto *EntryBB = I.getParent();
  auto NeedResult = !I.use_empty();

  auto *Ballot =
      B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());

  // Start inserting instructions for the ComputeLoop block.
  B.SetInsertPoint(ComputeLoop);
  // Phi nodes for the accumulator, the scan result destination, and the
  // remaining active lanes.
  auto *Accumulator = B.CreatePHI(Ty, 2, "Accumulator");
  Accumulator->addIncoming(Identity, EntryBB);
  PHINode *OldValuePhi = nullptr;
  if (NeedResult) {
    OldValuePhi = B.CreatePHI(Ty, 2, "OldValuePhi");
    OldValuePhi->addIncoming(PoisonValue::get(Ty), EntryBB);
  }
  auto *ActiveBits = B.CreatePHI(WaveTy, 2, "ActiveBits");
  ActiveBits->addIncoming(Ballot, EntryBB);

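  // Each iteration of ComputeLoop peels off the lowest remaining bit of
  // ActiveBits: read that lane's value, write the running total into that
  // lane's slot of the scan result, fold the value into the accumulator, and
  // clear the bit. The loop therefore runs once per lane that was active at
  // the ballot.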
  // Use the llvm.cttz intrinsic to find the lowest remaining active lane.
  auto *FF1 =
      B.CreateIntrinsic(Intrinsic::cttz, WaveTy, {ActiveBits, B.getTrue()});

  auto *LaneIdxInt = B.CreateTrunc(FF1, B.getInt32Ty());

  // Get the value required for the atomic operation.
  Value *LaneValue = B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_readlane,
                                       {V, LaneIdxInt});

  // Perform a writelane if the intermediate scan results are required later in
  // the kernel computations.
  Value *OldValue = nullptr;
  if (NeedResult) {
    OldValue = B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_writelane,
                                 {Accumulator, LaneIdxInt, OldValuePhi});
    OldValuePhi->addIncoming(OldValue, ComputeLoop);
  }

  // Accumulate the results.
  auto *NewAccumulator = buildNonAtomicBinOp(B, Op, Accumulator, LaneValue);
  Accumulator->addIncoming(NewAccumulator, ComputeLoop);

  // Clear the current lane's bit so that on the next iteration llvm.cttz
  // returns the next active lane.
  auto *Mask = B.CreateShl(ConstantInt::get(WaveTy, 1), FF1);

  auto *InverseMask = B.CreateXor(Mask, ConstantInt::get(WaveTy, -1));
  auto *NewActiveBits = B.CreateAnd(ActiveBits, InverseMask);
  ActiveBits->addIncoming(NewActiveBits, ComputeLoop);

  // Branch out of the loop when all lanes are processed.
  auto *IsEnd = B.CreateICmpEQ(NewActiveBits, ConstantInt::get(WaveTy, 0));
  B.CreateCondBr(IsEnd, ComputeEnd, ComputeLoop);

  B.SetInsertPoint(ComputeEnd);

  return {OldValue, NewAccumulator};
}

static Constant *getIdentityValueForAtomicOp(Type *const Ty,
                                             AtomicRMWInst::BinOp Op) {
  LLVMContext &C = Ty->getContext();
  const unsigned BitWidth = Ty->getPrimitiveSizeInBits();
  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::UMax:
    return ConstantInt::get(C, APInt::getMinValue(BitWidth));
  case AtomicRMWInst::And:
  case AtomicRMWInst::UMin:
    return ConstantInt::get(C, APInt::getMaxValue(BitWidth));
  case AtomicRMWInst::Max:
    return ConstantInt::get(C, APInt::getSignedMinValue(BitWidth));
  case AtomicRMWInst::Min:
    return ConstantInt::get(C, APInt::getSignedMaxValue(BitWidth));
  case AtomicRMWInst::FAdd:
    return ConstantFP::get(C, APFloat::getZero(Ty->getFltSemantics(), true));
  case AtomicRMWInst::FSub:
    return ConstantFP::get(C, APFloat::getZero(Ty->getFltSemantics(), false));
  case AtomicRMWInst::FMin:
  case AtomicRMWInst::FMax:
    // FIXME: atomicrmw fmax/fmin behave like llvm.maxnum/minnum so NaN is the
    // closest thing they have to an identity, but it still does not preserve
    // the difference between quiet and signaling NaNs or NaNs with different
    // payloads.
    return ConstantFP::get(C, APFloat::getNaN(Ty->getFltSemantics()));
  }
}

static Value *buildMul(IRBuilder<> &B, Value *LHS, Value *RHS) {
  const ConstantInt *CI = dyn_cast<ConstantInt>(LHS);
  return (CI && CI->isOne()) ? RHS : B.CreateMul(LHS, RHS);
}

void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
                                               AtomicRMWInst::BinOp Op,
                                               unsigned ValIdx,
                                               bool ValDivergent) const {
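  // Overall shape of the rewrite (a sketch; the exact IR depends on Op and the
  // scan strategy):
  //   1. Compute the combined value of all active lanes (reduction/scan).
  //   2. Let the single lowest active lane perform the atomic with it.
  //   3. Reconstruct each lane's own result as OldValue op LaneOffset, where
  //      LaneOffset is the exclusive scan for divergent values, or a closed
  //      form such as V * mbcnt for a uniform add.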
  // Start building just before the instruction.
  IRBuilder<> B(&I);

  if (AtomicRMWInst::isFPOperation(Op)) {
    B.setIsFPConstrained(I.getFunction()->hasFnAttribute(Attribute::StrictFP));
  }

  // If we are in a pixel shader, because of how we have to mask out helper
  // lane invocations, we need to record the entry and exit BB's.
  BasicBlock *PixelEntryBB = nullptr;
  BasicBlock *PixelExitBB = nullptr;

  // If we're optimizing an atomic within a pixel shader, we need to wrap the
  // entire atomic operation in a helper-lane check. We do not want any helper
  // lanes that are around only for the purposes of derivatives to take part
  // in any cross-lane communication, and we use a branch on whether the lane
  // is live to do this.
  if (IsPixelShader) {
    // Record I's original position as the entry block.
    PixelEntryBB = I.getParent();

    Value *const Cond = B.CreateIntrinsic(Intrinsic::amdgcn_ps_live, {});
    Instruction *const NonHelperTerminator =
        SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, &DTU, nullptr);

    // Record I's new position as the exit block.
    PixelExitBB = I.getParent();

    I.moveBefore(NonHelperTerminator->getIterator());
    B.SetInsertPoint(&I);
  }

  Type *const Ty = I.getType();
  Type *Int32Ty = B.getInt32Ty();
  bool isAtomicFloatingPointTy = Ty->isFloatingPointTy();
  [[maybe_unused]] const unsigned TyBitWidth = DL.getTypeSizeInBits(Ty);

  // This is the value in the atomic operation we need to combine in order to
  // reduce the number of atomic operations.
  Value *V = I.getOperand(ValIdx);

  // We need to know how many lanes are active within the wavefront, and we do
  // this by doing a ballot of active lanes.
  Type *const WaveTy = B.getIntNTy(ST.getWavefrontSize());
  CallInst *const Ballot =
      B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());

  // We need to know how many lanes are active within the wavefront that are
  // below us. If we counted each lane linearly starting from 0, a lane is
  // below us only if its associated index was less than ours. We do this by
  // using the mbcnt intrinsic.
  Value *Mbcnt;
  if (ST.isWave32()) {
    Mbcnt =
        B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {Ballot, B.getInt32(0)});
  } else {
    Value *const ExtractLo = B.CreateTrunc(Ballot, Int32Ty);
    Value *const ExtractHi = B.CreateTrunc(B.CreateLShr(Ballot, 32), Int32Ty);
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo,
                              {ExtractLo, B.getInt32(0)});
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {ExtractHi, Mbcnt});
  }

  Function *F = I.getFunction();
  LLVMContext &C = F->getContext();

  // For atomic sub, perform scan with add operation and allow one lane to
  // subtract the reduced value later.
  AtomicRMWInst::BinOp ScanOp = Op;
  if (Op == AtomicRMWInst::Sub) {
    ScanOp = AtomicRMWInst::Add;
  } else if (Op == AtomicRMWInst::FSub) {
    ScanOp = AtomicRMWInst::FAdd;
  }
  Value *Identity = getIdentityValueForAtomicOp(Ty, ScanOp);

  Value *ExclScan = nullptr;
  Value *NewV = nullptr;

  const bool NeedResult = !I.use_empty();

  BasicBlock *ComputeLoop = nullptr;
  BasicBlock *ComputeEnd = nullptr;
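  // DPP path in a nutshell (sketch): force inactive lanes to the identity,
  // build an inclusive scan with row_shr DPP ops (plus an exclusive scan via
  // buildShiftRight when the result is needed), then read the last lane to get
  // the wavefront total for the single atomic. When no result is needed, a
  // plain reduction via permlanex16 is cheaper.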
  // If we have a divergent value in each lane, we need to combine the values
  // with a scan, using either DPP or the iterative loop below.
  if (ValDivergent) {
    if (ScanImpl == ScanOptions::DPP) {
      // First we need to set all inactive invocations to the identity value,
      // so that they can correctly contribute to the final result.
      NewV =
          B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
      if (!NeedResult && ST.hasPermLaneX16()) {
        // On GFX10 the permlanex16 instruction helps us build a reduction
        // without too many readlanes and writelanes, which are generally bad
        // for performance.
        NewV = buildReduction(B, ScanOp, NewV, Identity);
      } else {
        NewV = buildScan(B, ScanOp, NewV, Identity);
        if (NeedResult)
          ExclScan = buildShiftRight(B, NewV, Identity);
        // Read the value from the last lane, which has accumulated the values
        // of each active lane in the wavefront. This will be our new value
        // which we will provide to the atomic operation.
        Value *const LastLaneIdx = B.getInt32(ST.getWavefrontSize() - 1);
        NewV = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readlane,
                                 {NewV, LastLaneIdx});
      }
      // Finally mark the readlanes in the WWM section.
      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
    } else if (ScanImpl == ScanOptions::Iterative) {
      // Alternative implementation for the scan.
      ComputeLoop = BasicBlock::Create(C, "ComputeLoop", F);
      ComputeEnd = BasicBlock::Create(C, "ComputeEnd", F);
      std::tie(ExclScan, NewV) = buildScanIteratively(B, ScanOp, Identity, V, I,
                                                      ComputeLoop, ComputeEnd);
    } else {
      llvm_unreachable("Atomic Optimizer is disabled for None strategy");
    }
  } else {
    switch (Op) {
    default:
      llvm_unreachable("Unhandled atomic op");

    case AtomicRMWInst::Add:
    case AtomicRMWInst::Sub: {
      // The new value we will be contributing to the atomic operation is the
      // old value times the number of active lanes.
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = buildMul(B, V, Ctpop);
      break;
    }
    case AtomicRMWInst::FAdd:
    case AtomicRMWInst::FSub: {
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Int32Ty, false);
      Value *const CtpopFP = B.CreateUIToFP(Ctpop, Ty);
      NewV = B.CreateFMul(V, CtpopFP);
      break;
    }
    case AtomicRMWInst::And:
    case AtomicRMWInst::Or:
    case AtomicRMWInst::Max:
    case AtomicRMWInst::Min:
    case AtomicRMWInst::UMax:
    case AtomicRMWInst::UMin:
    case AtomicRMWInst::FMin:
    case AtomicRMWInst::FMax:
      // These operations with a uniform value are idempotent: doing the atomic
      // operation multiple times has the same effect as doing it once.
      NewV = V;
      break;

    case AtomicRMWInst::Xor:
      // The new value we will be contributing to the atomic operation is the
      // old value times the parity of the number of active lanes.
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = buildMul(B, V, B.CreateAnd(Ctpop, 1));
      break;
    }
  }

  // We only want a single lane to enter our new control flow, and we do this
  // by checking if there are any active lanes below us. Only one lane will
  // have 0 active lanes below us, so that will be the only one to progress.
  Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getInt32(0));

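  // For the iterative strategy the control flow built below ends up as
  // (sketch): entry -> ComputeLoop (looping on itself) -> ComputeEnd, whose
  // moved terminator then branches either into the single_lane block or
  // straight to the exit block. The DPP and uniform paths only add the
  // single_lane block.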
  // Store I's original basic block before we split the block.
  BasicBlock *const OriginalBB = I.getParent();

  // We need to introduce some new control flow to force a single lane to be
  // active. We do this by splitting I's basic block at I, and introducing the
  // new block such that:
  // entry --> single_lane -\
  //       \------------------> exit
  Instruction *const SingleLaneTerminator =
      SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, &DTU, nullptr);

  // At this point we have split I's block to allow a single lane in the
  // wavefront to update the precomputed reduced value. We have also finished
  // generating the new control flow for the iterative approach, i.e. the loop
  // that performs the reduction and scan using ComputeLoop and ComputeEnd.
  // For that new control flow we need to move the branch instruction, i.e. the
  // terminator created by SplitBlockAndInsertIfThen, from I's block to the
  // ComputeEnd block, and set up the predecessor of the next block for when
  // the single lane is done updating the final reduced value.
  BasicBlock *Predecessor = nullptr;
  if (ValDivergent && ScanImpl == ScanOptions::Iterative) {
    // Move the terminator from I's block to the ComputeEnd block.
    //
    // OriginalBB is known to have a branch as terminator because
    // SplitBlockAndInsertIfThen will have inserted one.
    BranchInst *Terminator = cast<BranchInst>(OriginalBB->getTerminator());
    B.SetInsertPoint(ComputeEnd);
    Terminator->removeFromParent();
    B.Insert(Terminator);

    // Branch to the ComputeLoop block unconditionally from I's block for the
    // iterative approach.
    B.SetInsertPoint(OriginalBB);
    B.CreateBr(ComputeLoop);

    // Update the dominator tree for the new control flow.
    SmallVector<DominatorTree::UpdateType, 6> DomTreeUpdates(
        {{DominatorTree::Insert, OriginalBB, ComputeLoop},
         {DominatorTree::Insert, ComputeLoop, ComputeEnd}});

    // We're moving the terminator from OriginalBB to ComputeEnd, so make sure
    // we move the DT edges as well.
    for (auto *Succ : Terminator->successors()) {
      DomTreeUpdates.push_back({DominatorTree::Insert, ComputeEnd, Succ});
      DomTreeUpdates.push_back({DominatorTree::Delete, OriginalBB, Succ});
    }

    DTU.applyUpdates(DomTreeUpdates);

    Predecessor = ComputeEnd;
  } else {
    Predecessor = OriginalBB;
  }
  // Move the IR builder into single_lane next.
  B.SetInsertPoint(SingleLaneTerminator);

  // Clone the original atomic operation into single lane, replacing the
  // original value with our newly created one.
  Instruction *const NewI = I.clone();
  B.Insert(NewI);
  NewI->setOperand(ValIdx, NewV);

  // Move the IR builder into exit next, and start inserting just before the
  // original instruction.
  B.SetInsertPoint(&I);

  if (NeedResult) {
    // Create a PHI node to get our new atomic result into the exit block.
    PHINode *const PHI = B.CreatePHI(Ty, 2);
    PHI->addIncoming(PoisonValue::get(Ty), Predecessor);
    PHI->addIncoming(NewI, SingleLaneTerminator->getParent());

    // We need to broadcast the value of the lowest active lane (the first
    // lane) to all other lanes in the wavefront.
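    // Only the lane that executed the cloned atomic has its return value; the
    // PHI yields poison in every other lane. readfirstlane picks the lowest
    // active lane, which is exactly the mbcnt == 0 lane that took the
    // single_lane branch, so this copies the real result to the whole wave.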

    Value *ReadlaneVal = PHI;
    if (TyBitWidth < 32)
      ReadlaneVal = B.CreateZExt(PHI, B.getInt32Ty());

    Value *BroadcastI = B.CreateIntrinsic(
        ReadlaneVal->getType(), Intrinsic::amdgcn_readfirstlane, ReadlaneVal);
    if (TyBitWidth < 32)
      BroadcastI = B.CreateTrunc(BroadcastI, Ty);

    // Now that we have the result of our single atomic operation, we need to
    // get our individual lane's slice into the result. We use the lane offset
    // we previously calculated combined with the atomic result value we got
    // from the first lane, to get our lane's index into the atomic result.
    Value *LaneOffset = nullptr;
    if (ValDivergent) {
      if (ScanImpl == ScanOptions::DPP) {
        LaneOffset =
            B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan);
      } else if (ScanImpl == ScanOptions::Iterative) {
        LaneOffset = ExclScan;
      } else {
        llvm_unreachable("Atomic Optimizer is disabled for None strategy");
      }
    } else {
      Mbcnt = isAtomicFloatingPointTy ? B.CreateUIToFP(Mbcnt, Ty)
                                      : B.CreateIntCast(Mbcnt, Ty, false);
      switch (Op) {
      default:
        llvm_unreachable("Unhandled atomic op");
      case AtomicRMWInst::Add:
      case AtomicRMWInst::Sub:
        LaneOffset = buildMul(B, V, Mbcnt);
        break;
      case AtomicRMWInst::And:
      case AtomicRMWInst::Or:
      case AtomicRMWInst::Max:
      case AtomicRMWInst::Min:
      case AtomicRMWInst::UMax:
      case AtomicRMWInst::UMin:
      case AtomicRMWInst::FMin:
      case AtomicRMWInst::FMax:
        LaneOffset = B.CreateSelect(Cond, Identity, V);
        break;
      case AtomicRMWInst::Xor:
        LaneOffset = buildMul(B, V, B.CreateAnd(Mbcnt, 1));
        break;
      case AtomicRMWInst::FAdd:
      case AtomicRMWInst::FSub: {
        LaneOffset = B.CreateFMul(V, Mbcnt);
        break;
      }
      }
    }
    Value *Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);
    if (isAtomicFloatingPointTy) {
      // For fadd/fsub the first active lane of LaneOffset should be the
      // identity (-0.0 for fadd or +0.0 for fsub) but the value we calculated
      // is V * +0.0 which might have the wrong sign or might be nan (if V is
      // inf or nan).
      //
      // For all floating point ops if the in-memory value was a nan then the
      // binop we just built might have quieted it or changed its payload.
      //
      // Correct all these problems by using BroadcastI as the result in the
      // first active lane.
      Result = B.CreateSelect(Cond, BroadcastI, Result);
    }

    if (IsPixelShader) {
      // Need a final PHI to reconverge to above the helper lane branch mask.
      B.SetInsertPoint(PixelExitBB, PixelExitBB->getFirstNonPHIIt());

      PHINode *const PHI = B.CreatePHI(Ty, 2);
      PHI->addIncoming(PoisonValue::get(Ty), PixelEntryBB);
      PHI->addIncoming(Result, I.getParent());
      I.replaceAllUsesWith(PHI);
    } else {
      // Replace the original atomic instruction with the new one.
      I.replaceAllUsesWith(Result);
    }
  }

  // And delete the original.
  I.eraseFromParent();
}

INITIALIZE_PASS_BEGIN(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                      "AMDGPU atomic optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                    "AMDGPU atomic optimizations", false, false)

FunctionPass *llvm::createAMDGPUAtomicOptimizerPass(ScanOptions ScanStrategy) {
  return new AMDGPUAtomicOptimizer(ScanStrategy);
}