//===-- AMDGPUAtomicOptimizer.cpp -----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass optimizes atomic operations by using a single lane of a wavefront
/// to perform the atomic operation, thus reducing contention on that memory
/// location.
/// The atomic optimizer uses the following strategies to compute the scan and
/// reduced values:
/// 1. DPP -
///    This is the most efficient implementation for scan. DPP uses Whole Wave
///    Mode (WWM).
/// 2. Iterative -
///    An alternative implementation that iterates over all active lanes of the
///    wavefront using llvm.cttz and performs the scan using the readlane &
///    writelane intrinsics.
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/InitializePasses.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

#define DEBUG_TYPE "amdgpu-atomic-optimizer"

using namespace llvm;
using namespace llvm::AMDGPU;

namespace {

struct ReplacementInfo {
  Instruction *I;
  AtomicRMWInst::BinOp Op;
  unsigned ValIdx;
  bool ValDivergent;
};

class AMDGPUAtomicOptimizer : public FunctionPass {
public:
  static char ID;
  ScanOptions ScanImpl;
  AMDGPUAtomicOptimizer(ScanOptions ScanImpl)
      : FunctionPass(ID), ScanImpl(ScanImpl) {}

  bool runOnFunction(Function &F) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addRequired<UniformityInfoWrapperPass>();
    AU.addRequired<TargetPassConfig>();
  }
};

class AMDGPUAtomicOptimizerImpl
    : public InstVisitor<AMDGPUAtomicOptimizerImpl> {
private:
  SmallVector<ReplacementInfo, 8> ToReplace;
  const UniformityInfo *UA;
  const DataLayout *DL;
  DomTreeUpdater &DTU;
  const GCNSubtarget *ST;
  bool IsPixelShader;
  ScanOptions ScanImpl;

  Value *buildReduction(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                        Value *const Identity) const;
  Value *buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                   Value *const Identity) const;
  Value *buildShiftRight(IRBuilder<> &B, Value *V, Value *const Identity) const;

  std::pair<Value *, Value *>
  buildScanIteratively(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                       Value *const Identity, Value *V, Instruction &I,
                       BasicBlock *ComputeLoop, BasicBlock *ComputeEnd) const;

  void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx,
                      bool ValDivergent) const;

public:
  AMDGPUAtomicOptimizerImpl() = delete;

  AMDGPUAtomicOptimizerImpl(const UniformityInfo *UA, const DataLayout *DL,
                            DomTreeUpdater &DTU, const GCNSubtarget *ST,
                            bool IsPixelShader, ScanOptions ScanImpl)
      : UA(UA), DL(DL), DTU(DTU), ST(ST), IsPixelShader(IsPixelShader),
        ScanImpl(ScanImpl) {}

  bool run(Function &F);

  void visitAtomicRMWInst(AtomicRMWInst &I);
  void visitIntrinsicInst(IntrinsicInst &I);
};

} // namespace

char AMDGPUAtomicOptimizer::ID = 0;

char &llvm::AMDGPUAtomicOptimizerID = AMDGPUAtomicOptimizer::ID;

bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
  if (skipFunction(F)) {
    return false;
  }

  const UniformityInfo *UA =
      &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
  const DataLayout *DL = &F.getDataLayout();

  DominatorTreeWrapperPass *const DTW =
      getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  DomTreeUpdater DTU(DTW ? &DTW->getDomTree() : nullptr,
                     DomTreeUpdater::UpdateStrategy::Lazy);

  const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  const GCNSubtarget *ST = &TM.getSubtarget<GCNSubtarget>(F);

  bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;

  return AMDGPUAtomicOptimizerImpl(UA, DL, DTU, ST, IsPixelShader, ScanImpl)
      .run(F);
}

PreservedAnalyses AMDGPUAtomicOptimizerPass::run(Function &F,
                                                 FunctionAnalysisManager &AM) {

  const auto *UA = &AM.getResult<UniformityInfoAnalysis>(F);
  const DataLayout *DL = &F.getDataLayout();

  DomTreeUpdater DTU(&AM.getResult<DominatorTreeAnalysis>(F),
                     DomTreeUpdater::UpdateStrategy::Lazy);
  const GCNSubtarget *ST = &TM.getSubtarget<GCNSubtarget>(F);

  bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;

  bool IsChanged =
      AMDGPUAtomicOptimizerImpl(UA, DL, DTU, ST, IsPixelShader, ScanImpl)
          .run(F);

  if (!IsChanged) {
    return PreservedAnalyses::all();
  }

  PreservedAnalyses PA;
  PA.preserve<DominatorTreeAnalysis>();
  return PA;
}

bool AMDGPUAtomicOptimizerImpl::run(Function &F) {

  // Scan option None disables the Pass.
  if (ScanImpl == ScanOptions::None) {
    return false;
  }

  visit(F);

  const bool Changed = !ToReplace.empty();

  for (ReplacementInfo &Info : ToReplace) {
    optimizeAtomic(*Info.I, Info.Op, Info.ValIdx, Info.ValDivergent);
  }

  ToReplace.clear();

  return Changed;
}

static bool isLegalCrossLaneType(Type *Ty) {
  switch (Ty->getTypeID()) {
  case Type::FloatTyID:
  case Type::DoubleTyID:
    return true;
  case Type::IntegerTyID: {
    unsigned Size = Ty->getIntegerBitWidth();
    return (Size == 32 || Size == 64);
  }
  default:
    return false;
  }
}

void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) {
  // Early exit for unhandled address space atomic instructions.
  switch (I.getPointerAddressSpace()) {
  default:
    return;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::LOCAL_ADDRESS:
    break;
  }

  AtomicRMWInst::BinOp Op = I.getOperation();

  switch (Op) {
  default:
    return;
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::And:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin:
  case AtomicRMWInst::FAdd:
  case AtomicRMWInst::FSub:
  case AtomicRMWInst::FMax:
  case AtomicRMWInst::FMin:
    break;
  }

  // Only 32 and 64 bit floating point atomic ops are supported.
  if (AtomicRMWInst::isFPOperation(Op) &&
      !(I.getType()->isFloatTy() || I.getType()->isDoubleTy())) {
    return;
  }

  const unsigned PtrIdx = 0;
  const unsigned ValIdx = 1;

  // If the pointer operand is divergent, then each lane is doing an atomic
  // operation on a different address, and we cannot optimize that.
  if (UA->isDivergentUse(I.getOperandUse(PtrIdx))) {
    return;
  }

  bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget (for DPP strategy), and the atomic
  // operation is 32 or 64 bits.
  if (ValDivergent) {
    if (ScanImpl == ScanOptions::DPP && !ST->hasDPP())
      return;

    if (!isLegalCrossLaneType(I.getType()))
      return;
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}

void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) {
  AtomicRMWInst::BinOp Op;

  switch (I.getIntrinsicID()) {
  default:
    return;
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
    Op = AtomicRMWInst::Add;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
    Op = AtomicRMWInst::Sub;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
    Op = AtomicRMWInst::And;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
    Op = AtomicRMWInst::Or;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
    Op = AtomicRMWInst::Xor;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
    Op = AtomicRMWInst::Min;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
    Op = AtomicRMWInst::UMin;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
    Op = AtomicRMWInst::Max;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
    Op = AtomicRMWInst::UMax;
    break;
  }

  const unsigned ValIdx = 0;

  const bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget (for DPP strategy), and the atomic
  // operation is 32 or 64 bits.
  if (ValDivergent) {
    if (ScanImpl == ScanOptions::DPP && !ST->hasDPP())
      return;

    if (!isLegalCrossLaneType(I.getType()))
      return;
  }

  // If any of the other arguments to the intrinsic are divergent, we can't
  // optimize the operation.
  for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) {
    if (UA->isDivergentUse(I.getOperandUse(Idx))) {
      return;
    }
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}

// Use the builder to create the non-atomic counterpart of the specified
// atomicrmw binary op.
static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                                  Value *LHS, Value *RHS) {
  CmpInst::Predicate Pred;

  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
    return B.CreateBinOp(Instruction::Add, LHS, RHS);
  case AtomicRMWInst::FAdd:
    return B.CreateFAdd(LHS, RHS);
  case AtomicRMWInst::Sub:
    return B.CreateBinOp(Instruction::Sub, LHS, RHS);
  case AtomicRMWInst::FSub:
    return B.CreateFSub(LHS, RHS);
  case AtomicRMWInst::And:
    return B.CreateBinOp(Instruction::And, LHS, RHS);
  case AtomicRMWInst::Or:
    return B.CreateBinOp(Instruction::Or, LHS, RHS);
  case AtomicRMWInst::Xor:
    return B.CreateBinOp(Instruction::Xor, LHS, RHS);

  case AtomicRMWInst::Max:
    Pred = CmpInst::ICMP_SGT;
    break;
  case AtomicRMWInst::Min:
    Pred = CmpInst::ICMP_SLT;
    break;
  case AtomicRMWInst::UMax:
    Pred = CmpInst::ICMP_UGT;
    break;
  case AtomicRMWInst::UMin:
    Pred = CmpInst::ICMP_ULT;
    break;
  case AtomicRMWInst::FMax:
    return B.CreateMaxNum(LHS, RHS);
  case AtomicRMWInst::FMin:
    return B.CreateMinNum(LHS, RHS);
  }
  Value *Cond = B.CreateICmp(Pred, LHS, RHS);
  return B.CreateSelect(Cond, LHS, RHS);
}

// Use the builder to create a reduction of V across the wavefront, with all
// lanes active, returning the same result in all lanes.
Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
                                                 AtomicRMWInst::BinOp Op,
                                                 Value *V,
                                                 Value *const Identity) const {
  Type *AtomicTy = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);

  // Reduce within each row of 16 lanes.
  for (unsigned Idx = 0; Idx < 4; Idx++) {
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_XMASK0 | 1 << Idx),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
  }
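  // (Illustrative) Each ROW_XMASK step combines a lane with the lane whose
  // index differs only in bit Idx within its row of 16, so after the four
  // butterfly steps every lane in a row holds that row's reduction.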

  // Reduce within each pair of rows (i.e. 32 lanes).
  assert(ST->hasPermLaneX16());
  Value *Permlanex16Call = B.CreateIntrinsic(
      V->getType(), Intrinsic::amdgcn_permlanex16,
      {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
  V = buildNonAtomicBinOp(B, Op, V, Permlanex16Call);
  if (ST->isWave32()) {
    return V;
  }

  if (ST->hasPermLane64()) {
    // Reduce across the upper and lower 32 lanes.
    Value *Permlane64Call =
        B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_permlane64, V);
    return buildNonAtomicBinOp(B, Op, V, Permlane64Call);
  }

  // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
  // combine them with a scalar operation.
  Function *ReadLane =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, AtomicTy);
  Value *Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
  Value *Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
  return buildNonAtomicBinOp(B, Op, Lane0, Lane32);
}

// Use the builder to create an inclusive scan of V across the wavefront, with
// all lanes active.
Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
                                            AtomicRMWInst::BinOp Op, Value *V,
                                            Value *Identity) const {
  Type *AtomicTy = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);

  for (unsigned Idx = 0; Idx < 4; Idx++) {
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 | 1 << Idx),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
  }
  if (ST->hasDPPBroadcasts()) {
    // GFX9 has DPP row broadcast operations.
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST15), B.getInt32(0xa),
                      B.getInt32(0xf), B.getFalse()}));
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST31), B.getInt32(0xc),
                      B.getInt32(0xf), B.getFalse()}));
  } else {
    // On GFX10 all DPP operations are confined to a single row. To get cross-
    // row operations we have to use permlane or readlane.

    // Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes
    // 48..63).
    assert(ST->hasPermLaneX16());
    Value *PermX = B.CreateIntrinsic(
        V->getType(), Intrinsic::amdgcn_permlanex16,
        {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});

    Value *UpdateDPPCall = B.CreateCall(
        UpdateDPP, {Identity, PermX, B.getInt32(DPP::QUAD_PERM_ID),
                    B.getInt32(0xa), B.getInt32(0xf), B.getFalse()});
    V = buildNonAtomicBinOp(B, Op, V, UpdateDPPCall);

    if (!ST->isWave32()) {
      // Combine lane 31 into lanes 32..63.
      Value *const Lane31 = B.CreateIntrinsic(
          V->getType(), Intrinsic::amdgcn_readlane, {V, B.getInt32(31)});

      Value *UpdateDPPCall = B.CreateCall(
          UpdateDPP, {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID),
                      B.getInt32(0xc), B.getInt32(0xf), B.getFalse()});

      V = buildNonAtomicBinOp(B, Op, V, UpdateDPPCall);
    }
  }
  return V;
}

// Use the builder to create a shift right of V across the wavefront, with all
// lanes active, to turn an inclusive scan into an exclusive scan.
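//
// For example (illustrative): with per-lane values v0, v1, v2, ... the result
// is Identity, v0, v1, ...; i.e. lane N receives lane N-1's value and lane 0
// receives the identity.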
Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
                                                  Value *Identity) const {
  Type *AtomicTy = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);
  if (ST->hasDPPWavefrontShifts()) {
    // GFX9 has DPP wavefront shift operations.
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
                      B.getInt32(0xf), B.getFalse()});
  } else {
    Function *ReadLane =
        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, AtomicTy);
    Function *WriteLane =
        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, AtomicTy);

    // On GFX10 all DPP operations are confined to a single row. To get cross-
    // row operations we have to use permlane or readlane.
    Value *Old = V;
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 + 1),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});

    // Copy the old lane 15 to the new lane 16.
    V = B.CreateCall(WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(15)}),
                                 B.getInt32(16), V});

    if (!ST->isWave32()) {
      // Copy the old lane 31 to the new lane 32.
      V = B.CreateCall(
          WriteLane,
          {B.CreateCall(ReadLane, {Old, B.getInt32(31)}), B.getInt32(32), V});

      // Copy the old lane 47 to the new lane 48.
      V = B.CreateCall(
          WriteLane,
          {B.CreateCall(ReadLane, {Old, B.getInt32(47)}), B.getInt32(48), V});
    }
  }

  return V;
}

// Use the builder to create an exclusive scan and compute the final reduced
// value using an iterative approach. This provides an alternative
// implementation to DPP, which uses WWM for scan computations. This API
// iterates over the active lanes to read, compute and update the value using
// the readlane and writelane intrinsics.
std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildScanIteratively(
    IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *const Identity, Value *V,
    Instruction &I, BasicBlock *ComputeLoop, BasicBlock *ComputeEnd) const {
  auto *Ty = I.getType();
  auto *WaveTy = B.getIntNTy(ST->getWavefrontSize());
  auto *EntryBB = I.getParent();
  auto NeedResult = !I.use_empty();

  auto *Ballot =
      B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());

  // Start inserting instructions into the ComputeLoop block.
  B.SetInsertPoint(ComputeLoop);
  // Phi nodes for the accumulator, the scan result destination, and the
  // remaining active lanes.
  auto *Accumulator = B.CreatePHI(Ty, 2, "Accumulator");
  Accumulator->addIncoming(Identity, EntryBB);
  PHINode *OldValuePhi = nullptr;
  if (NeedResult) {
    OldValuePhi = B.CreatePHI(Ty, 2, "OldValuePhi");
    OldValuePhi->addIncoming(PoisonValue::get(Ty), EntryBB);
  }
  auto *ActiveBits = B.CreatePHI(WaveTy, 2, "ActiveBits");
  ActiveBits->addIncoming(Ballot, EntryBB);
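  // Loop invariant (roughly): Accumulator holds the reduction of the lanes
  // already processed, OldValuePhi holds their exclusive-scan results, and
  // ActiveBits holds the set of lanes still to be processed.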

  // Use the llvm.cttz intrinsic to find the lowest remaining active lane.
  auto *FF1 =
      B.CreateIntrinsic(Intrinsic::cttz, WaveTy, {ActiveBits, B.getTrue()});

  auto *LaneIdxInt = B.CreateTrunc(FF1, B.getInt32Ty());

  // Get the value required for the atomic operation.
  Value *LaneValue = B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_readlane,
                                       {V, LaneIdxInt});

  // Perform writelane if intermediate scan results are required later in the
  // kernel computations.
  Value *OldValue = nullptr;
  if (NeedResult) {
    OldValue = B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_writelane,
                                 {Accumulator, LaneIdxInt, OldValuePhi});
    OldValuePhi->addIncoming(OldValue, ComputeLoop);
  }

  // Accumulate the results.
  auto *NewAccumulator = buildNonAtomicBinOp(B, Op, Accumulator, LaneValue);
  Accumulator->addIncoming(NewAccumulator, ComputeLoop);

  // Clear the bit of the current active lane so that on the next iteration
  // llvm.cttz returns the next active lane.
  auto *Mask = B.CreateShl(ConstantInt::get(WaveTy, 1), FF1);

  auto *InverseMask = B.CreateXor(Mask, ConstantInt::get(WaveTy, -1));
  auto *NewActiveBits = B.CreateAnd(ActiveBits, InverseMask);
  ActiveBits->addIncoming(NewActiveBits, ComputeLoop);

  // Branch out of the loop when all lanes are processed.
  auto *IsEnd = B.CreateICmpEQ(NewActiveBits, ConstantInt::get(WaveTy, 0));
  B.CreateCondBr(IsEnd, ComputeEnd, ComputeLoop);

  B.SetInsertPoint(ComputeEnd);

  return {OldValue, NewAccumulator};
}

static Constant *getIdentityValueForAtomicOp(Type *const Ty,
                                             AtomicRMWInst::BinOp Op) {
  LLVMContext &C = Ty->getContext();
  const unsigned BitWidth = Ty->getPrimitiveSizeInBits();
  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::UMax:
    return ConstantInt::get(C, APInt::getMinValue(BitWidth));
  case AtomicRMWInst::And:
  case AtomicRMWInst::UMin:
    return ConstantInt::get(C, APInt::getMaxValue(BitWidth));
  case AtomicRMWInst::Max:
    return ConstantInt::get(C, APInt::getSignedMinValue(BitWidth));
  case AtomicRMWInst::Min:
    return ConstantInt::get(C, APInt::getSignedMaxValue(BitWidth));
  case AtomicRMWInst::FAdd:
    return ConstantFP::get(C, APFloat::getZero(Ty->getFltSemantics(), true));
  case AtomicRMWInst::FSub:
    return ConstantFP::get(C, APFloat::getZero(Ty->getFltSemantics(), false));
  case AtomicRMWInst::FMin:
  case AtomicRMWInst::FMax:
    // FIXME: atomicrmw fmax/fmin behave like llvm.maxnum/minnum so NaN is the
    // closest thing they have to an identity, but it still does not preserve
    // the difference between quiet and signaling NaNs or NaNs with different
    // payloads.
    return ConstantFP::get(C, APFloat::getNaN(Ty->getFltSemantics()));
  }
}

static Value *buildMul(IRBuilder<> &B, Value *LHS, Value *RHS) {
  const ConstantInt *CI = dyn_cast<ConstantInt>(LHS);
  return (CI && CI->isOne()) ? RHS : B.CreateMul(LHS, RHS);
}

void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
                                               AtomicRMWInst::BinOp Op,
                                               unsigned ValIdx,
                                               bool ValDivergent) const {
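  // Rough shape of the rewrite: compute a wavefront-wide combined value, let
  // only the lowest active lane issue the real atomic, then rebuild each
  // lane's own result from the broadcast old value plus a per-lane offset
  // (an exclusive scan for divergent values, an mbcnt-based count for
  // uniform ones).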

  // Start building just before the instruction.
  IRBuilder<> B(&I);

  if (AtomicRMWInst::isFPOperation(Op)) {
    B.setIsFPConstrained(I.getFunction()->hasFnAttribute(Attribute::StrictFP));
  }

  // If we are in a pixel shader, because of how we have to mask out helper
  // lane invocations, we need to record the entry and exit BB's.
  BasicBlock *PixelEntryBB = nullptr;
  BasicBlock *PixelExitBB = nullptr;

  // If we're optimizing an atomic within a pixel shader, we need to wrap the
  // entire atomic operation in a helper-lane check. We do not want any helper
  // lanes that are around only for the purposes of derivatives to take part
  // in any cross-lane communication, and we use a branch on whether the lane is
  // live to do this.
  if (IsPixelShader) {
    // Record I's original position as the entry block.
    PixelEntryBB = I.getParent();

    Value *const Cond = B.CreateIntrinsic(Intrinsic::amdgcn_ps_live, {}, {});
    Instruction *const NonHelperTerminator =
        SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, &DTU, nullptr);

    // Record I's new position as the exit block.
    PixelExitBB = I.getParent();

    I.moveBefore(NonHelperTerminator);
    B.SetInsertPoint(&I);
  }

  Type *const Ty = I.getType();
  Type *Int32Ty = B.getInt32Ty();
  bool isAtomicFloatingPointTy = Ty->isFloatingPointTy();
  [[maybe_unused]] const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);

  // This is the value in the atomic operation we need to combine in order to
  // reduce the number of atomic operations.
  Value *V = I.getOperand(ValIdx);

  // We need to know how many lanes are active within the wavefront, and we do
  // this by doing a ballot of active lanes.
  Type *const WaveTy = B.getIntNTy(ST->getWavefrontSize());
  CallInst *const Ballot =
      B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());

  // We need to know how many lanes are active within the wavefront that are
  // below us. If we counted each lane linearly starting from 0, a lane is
  // below us only if its associated index was less than ours. We do this by
  // using the mbcnt intrinsic.
  Value *Mbcnt;
  if (ST->isWave32()) {
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                              {Ballot, B.getInt32(0)});
  } else {
    Value *const ExtractLo = B.CreateTrunc(Ballot, Int32Ty);
    Value *const ExtractHi = B.CreateTrunc(B.CreateLShr(Ballot, 32), Int32Ty);
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                              {ExtractLo, B.getInt32(0)});
    Mbcnt =
        B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {ExtractHi, Mbcnt});
  }

  Function *F = I.getFunction();
  LLVMContext &C = F->getContext();

  // For atomic sub, perform scan with add operation and allow one lane to
  // subtract the reduced value later.
  AtomicRMWInst::BinOp ScanOp = Op;
  if (Op == AtomicRMWInst::Sub) {
    ScanOp = AtomicRMWInst::Add;
  } else if (Op == AtomicRMWInst::FSub) {
    ScanOp = AtomicRMWInst::FAdd;
  }
  Value *Identity = getIdentityValueForAtomicOp(Ty, ScanOp);

  Value *ExclScan = nullptr;
  Value *NewV = nullptr;

  const bool NeedResult = !I.use_empty();

  BasicBlock *ComputeLoop = nullptr;
  BasicBlock *ComputeEnd = nullptr;
  // If we have a divergent value in each lane, we need to combine the value
  // using DPP.
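  // E.g. (illustrative) for an 'add' where lane i contributes Vi: NewV becomes
  // the wavefront-wide sum, ExclScan in lane i holds V0 + ... + V(i-1), and
  // lane i's final result is the atomic's old value plus that exclusive scan.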
  if (ValDivergent) {
    if (ScanImpl == ScanOptions::DPP) {
      // First we need to set all inactive invocations to the identity value, so
      // that they can correctly contribute to the final result.
      NewV =
          B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
      if (!NeedResult && ST->hasPermLaneX16()) {
        // On GFX10 the permlanex16 instruction helps us build a reduction
        // without too many readlanes and writelanes, which are generally bad
        // for performance.
        NewV = buildReduction(B, ScanOp, NewV, Identity);
      } else {
        NewV = buildScan(B, ScanOp, NewV, Identity);
        if (NeedResult)
          ExclScan = buildShiftRight(B, NewV, Identity);
        // Read the value from the last lane, which has accumulated the values
        // of each active lane in the wavefront. This will be our new value
        // which we will provide to the atomic operation.
        Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
        NewV = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readlane,
                                 {NewV, LastLaneIdx});
      }
      // Finally mark the readlanes in the WWM section.
      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
    } else if (ScanImpl == ScanOptions::Iterative) {
      // Alternative implementation for scan.
      ComputeLoop = BasicBlock::Create(C, "ComputeLoop", F);
      ComputeEnd = BasicBlock::Create(C, "ComputeEnd", F);
      std::tie(ExclScan, NewV) = buildScanIteratively(B, ScanOp, Identity, V, I,
                                                      ComputeLoop, ComputeEnd);
    } else {
      llvm_unreachable("Atomic Optimizer is disabled for None strategy");
    }
  } else {
    switch (Op) {
    default:
      llvm_unreachable("Unhandled atomic op");

    case AtomicRMWInst::Add:
    case AtomicRMWInst::Sub: {
      // The new value we will be contributing to the atomic operation is the
      // old value times the number of active lanes.
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = buildMul(B, V, Ctpop);
      break;
    }
    case AtomicRMWInst::FAdd:
    case AtomicRMWInst::FSub: {
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Int32Ty, false);
      Value *const CtpopFP = B.CreateUIToFP(Ctpop, Ty);
      NewV = B.CreateFMul(V, CtpopFP);
      break;
    }
    case AtomicRMWInst::And:
    case AtomicRMWInst::Or:
    case AtomicRMWInst::Max:
    case AtomicRMWInst::Min:
    case AtomicRMWInst::UMax:
    case AtomicRMWInst::UMin:
    case AtomicRMWInst::FMin:
    case AtomicRMWInst::FMax:
      // These operations with a uniform value are idempotent: doing the atomic
      // operation multiple times has the same effect as doing it once.
      NewV = V;
      break;

    case AtomicRMWInst::Xor:
      // The new value we will be contributing to the atomic operation is the
      // old value times the parity of the number of active lanes.
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = buildMul(B, V, B.CreateAnd(Ctpop, 1));
      break;
    }
  }

  // We only want a single lane to enter our new control flow, and we do this
  // by checking if there are any active lanes below us. Only one lane will
  // have 0 active lanes below us, so that will be the only one to progress.
  Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getInt32(0));
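  // For the iterative strategy the control flow built below is, roughly,
  //   entry -> ComputeLoop (looping until all lanes are done) -> ComputeEnd,
  // with ComputeEnd then taking the role of 'entry' in the diagram below.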

  // Store I's original basic block before we split the block.
  BasicBlock *const OriginalBB = I.getParent();

  // We need to introduce some new control flow to force a single lane to be
  // active. We do this by splitting I's basic block at I, and introducing the
  // new block such that:
  // entry --> single_lane -\
  //       \------------------> exit
  Instruction *const SingleLaneTerminator =
      SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, &DTU, nullptr);

  // At this point we have split I's block to allow one lane in the wavefront
  // to update the precomputed reduced value and, for the iterative strategy,
  // we have finished the codegen for the new control flow, i.e. the loop which
  // performs the reduction and scan using ComputeLoop and ComputeEnd.
  // For that new control flow, we need to move the branch instruction, i.e. the
  // terminator created during SplitBlockAndInsertIfThen, from I's block to the
  // ComputeEnd block. We also need to set up the predecessor of the next block
  // for when the single lane is done updating the final reduced value.
  BasicBlock *Predecessor = nullptr;
  if (ValDivergent && ScanImpl == ScanOptions::Iterative) {
    // Move the terminator from I's block to the ComputeEnd block.
    //
    // OriginalBB is known to have a branch as terminator because
    // SplitBlockAndInsertIfThen will have inserted one.
    BranchInst *Terminator = cast<BranchInst>(OriginalBB->getTerminator());
    B.SetInsertPoint(ComputeEnd);
    Terminator->removeFromParent();
    B.Insert(Terminator);

    // Branch to the ComputeLoop block unconditionally from I's block for the
    // iterative approach.
    B.SetInsertPoint(OriginalBB);
    B.CreateBr(ComputeLoop);

    // Update the dominator tree for the new control flow.
    SmallVector<DominatorTree::UpdateType, 6> DomTreeUpdates(
        {{DominatorTree::Insert, OriginalBB, ComputeLoop},
         {DominatorTree::Insert, ComputeLoop, ComputeEnd}});

    // We're moving the terminator from OriginalBB to ComputeEnd, make sure we
    // move the DT edges as well.
    for (auto *Succ : Terminator->successors()) {
      DomTreeUpdates.push_back({DominatorTree::Insert, ComputeEnd, Succ});
      DomTreeUpdates.push_back({DominatorTree::Delete, OriginalBB, Succ});
    }

    DTU.applyUpdates(DomTreeUpdates);

    Predecessor = ComputeEnd;
  } else {
    Predecessor = OriginalBB;
  }
  // Move the IR builder into single_lane next.
  B.SetInsertPoint(SingleLaneTerminator);

  // Clone the original atomic operation into single lane, replacing the
  // original value with our newly created one.
  Instruction *const NewI = I.clone();
  B.Insert(NewI);
  NewI->setOperand(ValIdx, NewV);

  // Move the IR builder into exit next, and start inserting just before the
  // original instruction.
  B.SetInsertPoint(&I);

  if (NeedResult) {
    // Create a PHI node to get our new atomic result into the exit block.
    PHINode *const PHI = B.CreatePHI(Ty, 2);
    PHI->addIncoming(PoisonValue::get(Ty), Predecessor);
    PHI->addIncoming(NewI, SingleLaneTerminator->getParent());

    // We need to broadcast the value from the lowest active lane (the first
    // lane) to all other lanes in the wavefront. We use an intrinsic for this,
    // which is overloaded on the value type so that it also handles 64-bit
    // broadcasts.
    Value *BroadcastI = nullptr;
    BroadcastI = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readfirstlane, PHI);
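    // E.g. (illustrative) for a uniform 'add %v': the single atomic added
    // %v * popcount(active lanes), and each lane's own result is the broadcast
    // old value plus %v * (the number of active lanes below it).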

    // Now that we have the result of our single atomic operation, we need to
    // get our individual lane's slice into the result. We use the lane offset
    // we previously calculated combined with the atomic result value we got
    // from the first lane, to get our lane's individual result.
    Value *LaneOffset = nullptr;
    if (ValDivergent) {
      if (ScanImpl == ScanOptions::DPP) {
        LaneOffset =
            B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan);
      } else if (ScanImpl == ScanOptions::Iterative) {
        LaneOffset = ExclScan;
      } else {
        llvm_unreachable("Atomic Optimizer is disabled for None strategy");
      }
    } else {
      Mbcnt = isAtomicFloatingPointTy ? B.CreateUIToFP(Mbcnt, Ty)
                                      : B.CreateIntCast(Mbcnt, Ty, false);
      switch (Op) {
      default:
        llvm_unreachable("Unhandled atomic op");
      case AtomicRMWInst::Add:
      case AtomicRMWInst::Sub:
        LaneOffset = buildMul(B, V, Mbcnt);
        break;
      case AtomicRMWInst::And:
      case AtomicRMWInst::Or:
      case AtomicRMWInst::Max:
      case AtomicRMWInst::Min:
      case AtomicRMWInst::UMax:
      case AtomicRMWInst::UMin:
      case AtomicRMWInst::FMin:
      case AtomicRMWInst::FMax:
        LaneOffset = B.CreateSelect(Cond, Identity, V);
        break;
      case AtomicRMWInst::Xor:
        LaneOffset = buildMul(B, V, B.CreateAnd(Mbcnt, 1));
        break;
      case AtomicRMWInst::FAdd:
      case AtomicRMWInst::FSub: {
        LaneOffset = B.CreateFMul(V, Mbcnt);
        break;
      }
      }
    }
    Value *Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);
    if (isAtomicFloatingPointTy) {
      // For fadd/fsub the first active lane of LaneOffset should be the
      // identity (-0.0 for fadd or +0.0 for fsub) but the value we calculated
      // is V * +0.0 which might have the wrong sign or might be nan (if V is
      // inf or nan).
      //
      // For all floating point ops if the in-memory value was a nan then the
      // binop we just built might have quieted it or changed its payload.
      //
      // Correct all these problems by using BroadcastI as the result in the
      // first active lane.
      Result = B.CreateSelect(Cond, BroadcastI, Result);
    }

    if (IsPixelShader) {
      // Need a final PHI to reconverge to above the helper lane branch mask.
      B.SetInsertPoint(PixelExitBB, PixelExitBB->getFirstNonPHIIt());

      PHINode *const PHI = B.CreatePHI(Ty, 2);
      PHI->addIncoming(PoisonValue::get(Ty), PixelEntryBB);
      PHI->addIncoming(Result, I.getParent());
      I.replaceAllUsesWith(PHI);
    } else {
      // Replace the original atomic instruction with the new one.
      I.replaceAllUsesWith(Result);
    }
  }

  // And delete the original.
  I.eraseFromParent();
}

INITIALIZE_PASS_BEGIN(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                      "AMDGPU atomic optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                    "AMDGPU atomic optimizations", false, false)

FunctionPass *llvm::createAMDGPUAtomicOptimizerPass(ScanOptions ScanStrategy) {
  return new AMDGPUAtomicOptimizer(ScanStrategy);
}