Lines Matching +full:single +full:- +full:lane

1 //===-- AMDGPUAtomicOptimizer.cpp -----------------------------------------===//
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
10 /// This pass optimizes atomic operations by using a single lane of a wavefront
15 /// 1. DPP -
18 /// 2. Iterative -
22 //===----------------------------------------------------------------------===//
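As a rough host-side model of what the pass achieves, assuming an integer add for concreteness (an illustration, not the pass's own code): rather than every active lane issuing its own atomic to the same address, the wavefront reduces its values, one lane issues a single atomic with the combined value, and each lane rebuilds the value its own atomic would have returned from the old memory contents plus an exclusive prefix sum of the lanes below it.

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Model only: the wavefront is an array of per-lane values, "memory" is the
    // shared atomic location. One RMW replaces one-per-lane RMWs.
    uint32_t waveAtomicAdd(uint32_t &memory, const std::vector<uint32_t> &laneValues,
                           std::vector<uint32_t> &laneResults) {
      uint32_t sum = 0;
      for (uint32_t v : laneValues)        // wave-wide reduction
        sum += v;
      uint32_t old = memory;               // the single atomic RMW, done by one lane
      memory += sum;
      uint32_t prefix = 0;
      for (std::size_t lane = 0; lane < laneValues.size(); ++lane) {
        laneResults[lane] = old + prefix;  // what each lane's own atomicrmw would return
        prefix += laneValues[lane];        // exclusive prefix sum of the lanes below
      }
      return old;
    }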
36 #define DEBUG_TYPE "amdgpu-atomic-optimizer"
123 DomTreeUpdater DTU(DTW ? &DTW->getDomTree() : nullptr, in runOnFunction()
182 switch (Ty->getTypeID()) { in isLegalCrossLaneType()
187 unsigned Size = Ty->getIntegerBitWidth(); in isLegalCrossLaneType()
228 !(I.getType()->isFloatTy() || I.getType()->isDoubleTy())) { in visitAtomicRMWInst()
235 // If the pointer operand is divergent, then each lane is doing an atomic in visitAtomicRMWInst()
237 if (UA->isDivergentUse(I.getOperandUse(PtrIdx))) { in visitAtomicRMWInst()
241 bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx)); in visitAtomicRMWInst()
243 // If the value operand is divergent, each lane is contributing a different in visitAtomicRMWInst()
248 if (ScanImpl == ScanOptions::DPP && !ST->hasDPP()) in visitAtomicRMWInst()
255 // If we get here, we can optimize the atomic using a single wavefront-wide in visitAtomicRMWInst()
327 const bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx)); in visitIntrinsicInst()
329 // If the value operand is divergent, each lane is contributing a different in visitIntrinsicInst()
334 if (ScanImpl == ScanOptions::DPP && !ST->hasDPP()) in visitIntrinsicInst()
344 if (UA->isDivergentUse(I.getOperandUse(Idx))) { in visitIntrinsicInst()
349 // If we get here, we can optimize the atomic using a single wavefront-wide in visitIntrinsicInst()
357 // Use the builder to create the non-atomic counterpart of the specified
408 Type *AtomicTy = V->getType(); in buildReduction()
409 Module *M = B.GetInsertBlock()->getModule(); in buildReduction()
423 assert(ST->hasPermLaneX16()); in buildReduction()
425 V->getType(), Intrinsic::amdgcn_permlanex16, in buildReduction()
426 {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()}); in buildReduction()
428 if (ST->isWave32()) { in buildReduction()
432 if (ST->hasPermLane64()) { in buildReduction()
435 B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_permlane64, V); in buildReduction()
439 // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and in buildReduction()
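A host-side sketch of the cross-lane combining that the permlanex16/permlane64/readlane calls above perform during the reduction (illustrative only; it uses a generic butterfly exchange rather than the exact DPP sequence, and assumes an integer add):

    #include <array>
    #include <cstdint>

    // Lanes repeatedly exchange values with a partner lane at increasing distance
    // and combine them; after log2(64) steps every lane holds the full wave sum.
    uint32_t waveReduceAdd(std::array<uint32_t, 64> lanes) {
      for (unsigned stride = 1; stride < 64; stride *= 2) {
        std::array<uint32_t, 64> partner = lanes;
        for (unsigned lane = 0; lane < 64; ++lane)
          lanes[lane] += partner[lane ^ stride];  // exchange-and-combine step
      }
      return lanes[0];                            // every lane now holds the same total
    }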
453 Type *AtomicTy = V->getType(); in buildScan()
454 Module *M = B.GetInsertBlock()->getModule(); in buildScan()
465 if (ST->hasDPPBroadcasts()) { in buildScan()
478 // On GFX10 all DPP operations are confined to a single row. To get cross- in buildScan()
481 // Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes in buildScan()
483 assert(ST->hasPermLaneX16()); in buildScan()
485 V->getType(), Intrinsic::amdgcn_permlanex16, in buildScan()
486 {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()}); in buildScan()
493 if (!ST->isWave32()) { in buildScan()
494 // Combine lane 31 into lanes 32..63. in buildScan()
496 V->getType(), Intrinsic::amdgcn_readlane, {V, B.getInt32(31)}); in buildScan()
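The shifted combines above build a wave-wide inclusive prefix scan; a host-side sketch of the same shift-and-add pattern (illustrative, integer add assumed, row boundaries ignored):

    #include <array>
    #include <cstdint>

    // Each step adds the value from the lane `shift` positions lower, doubling the
    // shift every step (Hillis-Steele scan). Afterwards lane N holds the sum of
    // lanes 0..N, i.e. an inclusive scan.
    std::array<uint32_t, 64> waveInclusiveScanAdd(std::array<uint32_t, 64> lanes) {
      for (unsigned shift = 1; shift < 64; shift *= 2) {
        std::array<uint32_t, 64> prev = lanes;
        for (unsigned lane = shift; lane < 64; ++lane)
          lanes[lane] += prev[lane - shift];  // combine with the lane `shift` below
      }
      return lanes;
    }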
512 Type *AtomicTy = V->getType(); in buildShiftRight()
513 Module *M = B.GetInsertBlock()->getModule(); in buildShiftRight()
516 if (ST->hasDPPWavefrontShifts()) { in buildShiftRight()
527 // On GFX10 all DPP operations are confined to a single row. To get cross- in buildShiftRight()
534 // Copy the old lane 15 to the new lane 16. in buildShiftRight()
538 if (!ST->isWave32()) { in buildShiftRight()
539 // Copy the old lane 31 to the new lane 32. in buildShiftRight()
544 // Copy the old lane 47 to the new lane 48. in buildShiftRight()
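buildShiftRight turns that inclusive scan into an exclusive one by moving every lane's value one lane up and seeding lane 0 with the operation's identity; a minimal model (integer add assumed):

    #include <array>
    #include <cstdint>

    // Old lane 15 becomes new lane 16, old lane 31 becomes new lane 32, and so on;
    // lane 0 receives the identity so its exclusive scan value is "nothing yet".
    std::array<uint32_t, 64> shiftRightOneLane(const std::array<uint32_t, 64> &lanes,
                                               uint32_t identity) {
      std::array<uint32_t, 64> out{};
      out[0] = identity;
      for (unsigned lane = 1; lane < 64; ++lane)
        out[lane] = lanes[lane - 1];
      return out;
    }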
563 auto *WaveTy = B.getIntNTy(ST->getWavefrontSize()); in buildScanIteratively()
574 Accumulator->addIncoming(Identity, EntryBB); in buildScanIteratively()
578 OldValuePhi->addIncoming(PoisonValue::get(Ty), EntryBB); in buildScanIteratively()
581 ActiveBits->addIncoming(Ballot, EntryBB); in buildScanIteratively()
583 // Use the llvm.cttz intrinsic to find the lowest remaining active lane. in buildScanIteratively()

590 Value *LaneValue = B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_readlane, in buildScanIteratively()
597 OldValue = B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_writelane, in buildScanIteratively()
599 OldValuePhi->addIncoming(OldValue, ComputeLoop); in buildScanIteratively()
604 Accumulator->addIncoming(NewAccumulator, ComputeLoop); in buildScanIteratively()
606 // Clear the current active lane's bit so that on the next iteration llvm.cttz in buildScanIteratively()
607 // returns the next active lane. in buildScanIteratively()
610 auto *InverseMask = B.CreateXor(Mask, ConstantInt::get(WaveTy, -1)); in buildScanIteratively()
612 ActiveBits->addIncoming(NewActiveBits, ComputeLoop); in buildScanIteratively()
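A host-side model of the iterative strategy shown above, walking the ballot mask from the lowest active lane upward (std::countr_zero stands in for llvm.cttz, and an integer add is assumed):

    #include <array>
    #include <bit>
    #include <cstdint>

    // Hand each active lane the running total so far (its exclusive scan value),
    // fold its own value into the accumulator, then clear its bit so the next
    // countr_zero finds the next active lane.
    uint32_t iterativeScanAdd(uint64_t activeBits,
                              const std::array<uint32_t, 64> &laneValue,
                              std::array<uint32_t, 64> &laneScan) {
      uint32_t accumulator = 0;                        // identity for add
      while (activeBits != 0) {
        unsigned lane = std::countr_zero(activeBits);  // lowest remaining active lane
        laneScan[lane] = accumulator;                  // the writelane of the old accumulator
        accumulator += laneValue[lane];                // readlane + op
        activeBits &= activeBits - 1;                  // clear that lane's bit (the mask/xor/and above)
      }
      return accumulator;                              // wave-wide reduction result
    }

The accumulator that leaves the loop is the reduction used for the single atomic; the per-lane scan values become each lane's offset into the result.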
625 LLVMContext &C = Ty->getContext(); in getIdentityValueForAtomicOp()
626 const unsigned BitWidth = Ty->getPrimitiveSizeInBits(); in getIdentityValueForAtomicOp()
644 return ConstantFP::get(C, APFloat::getZero(Ty->getFltSemantics(), true)); in getIdentityValueForAtomicOp()
646 return ConstantFP::get(C, APFloat::getZero(Ty->getFltSemantics(), false)); in getIdentityValueForAtomicOp()
653 return ConstantFP::get(C, APFloat::getNaN(Ty->getFltSemantics())); in getIdentityValueForAtomicOp()
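The identities returned above exist so that a lane seeded with them leaves the combined result unchanged, i.e. op(identity, x) == x; a small self-checking illustration (the integer cases are not shown in the fragments above but follow the same rule, and the NaN case relies on fmax(NaN, x) == x):

    #include <cassert>
    #include <cmath>
    #include <cstdint>

    int main() {
      uint32_t x = 42;
      assert((x + 0u) == x);                     // add/sub/or/xor identity: 0
      assert((x & ~0u) == x);                    // and identity: all ones
      double d = 1.5;
      assert(d + (-0.0) == d);                   // fadd identity: -0.0 (per the line above)
      assert(std::fmax(std::nan(""), d) == d);   // float max identity: NaN
      return 0;
    }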
659 return (CI && CI->isOne()) ? RHS : B.CreateMul(LHS, RHS); in buildMul()
670 B.setIsFPConstrained(I.getFunction()->hasFnAttribute(Attribute::StrictFP)); in optimizeAtomic()
674 // lane invocations, we need to record the entry and exit BB's. in optimizeAtomic()
679 // entire atomic operation in a helper-lane check. We do not want any helper in optimizeAtomic()
681 // in any cross-lane communication, and we use a branch on whether the lane is in optimizeAtomic()
700 bool isAtomicFloatingPointTy = Ty->isFloatingPointTy(); in optimizeAtomic()
701 [[maybe_unused]] const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty); in optimizeAtomic()
709 Type *const WaveTy = B.getIntNTy(ST->getWavefrontSize()); in optimizeAtomic()
714 // below us. If we counted each lane linearly starting from 0, a lane is in optimizeAtomic()
718 if (ST->isWave32()) { in optimizeAtomic()
731 LLVMContext &C = F->getContext(); in optimizeAtomic()
733 // For atomic sub, perform the scan with an add operation and allow one lane to in optimizeAtomic()
750 // If we have a divergent value in each lane, we need to combine the value in optimizeAtomic()
758 if (!NeedResult && ST->hasPermLaneX16()) { in optimizeAtomic()
767 // Read the value from the last lane, which has accumulated the values in optimizeAtomic()
768 // of each active lane in the wavefront. This will be our new value in optimizeAtomic()
770 Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1); in optimizeAtomic()
830 // We only want a single lane to enter our new control flow, and we do this in optimizeAtomic()
831 // by checking if there are any active lanes below us. Only one lane will in optimizeAtomic()
838 // We need to introduce some new control flow to force a single lane to be in optimizeAtomic()
841 // entry --> single_lane -\ in optimizeAtomic()
842 //      \------------------> exit in optimizeAtomic()
846 // At this point, we have split I's block to allow one lane in the wavefront in optimizeAtomic()
853 // single lane done updating the final reduced value. in optimizeAtomic()
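Putting the pieces together, a host-side model of the control flow sketched above: the lane whose count of lower active lanes (mbcnt) is zero takes the single_lane path and performs the one real atomic, every other lane skips it, and all lanes pick up the broadcast old value plus their own scan offset at the exit block (illustrative only; integer add assumed):

    #include <array>
    #include <bit>
    #include <cstdint>

    uint32_t singleLaneAtomicAdd(uint32_t &memory, uint64_t execMask,
                                 const std::array<uint32_t, 64> &laneScan,
                                 std::array<uint32_t, 64> &laneResult,
                                 uint32_t waveSum) {
      uint32_t broadcastOld = 0;
      for (unsigned lane = 0; lane < 64; ++lane) {
        if (!((execMask >> lane) & 1))
          continue;                                        // inactive helper lane: does nothing
        bool lowest =                                      // mbcnt of lower active lanes == 0
            std::popcount(execMask & ((uint64_t{1} << lane) - 1)) == 0;
        if (lowest) {                                      // the "single_lane" block
          broadcastOld = memory;                           // old value from the one real atomic
          memory += waveSum;
        }
        laneResult[lane] = broadcastOld + laneScan[lane];  // "exit": broadcast + lane offset
      }
      return broadcastOld;
    }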
860 BranchInst *Terminator = cast<BranchInst>(OriginalBB->getTerminator()); in optimizeAtomic()
862 Terminator->removeFromParent(); in optimizeAtomic()
877 for (auto *Succ : Terminator->successors()) { in optimizeAtomic()
891 // Clone the original atomic operation into the single lane, replacing the in optimizeAtomic()
895 NewI->setOperand(ValIdx, NewV); in optimizeAtomic()
904 PHI->addIncoming(PoisonValue::get(Ty), Predecessor); in optimizeAtomic()
905 PHI->addIncoming(NewI, SingleLaneTerminator->getParent()); in optimizeAtomic()
907 // We need to broadcast the value from the lowest active lane (the first in optimizeAtomic()
908 // lane) to all other lanes in the wavefront. We use an intrinsic for this, in optimizeAtomic()
909 // but have to handle 64-bit broadcasts with two calls to this intrinsic. in optimizeAtomic()
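A sketch of that 64-bit broadcast: split the value, broadcast each 32-bit half, and reassemble (readFirstLane32 is an identity stub standing in for the 32-bit readfirstlane intrinsic):

    #include <cstdint>

    // Stand-in for a 32-bit readfirstlane; on hardware it returns lane 0's copy.
    static uint32_t readFirstLane32(uint32_t v) { return v; }

    uint64_t broadcastFirstLane64(uint64_t v) {
      uint32_t lo = readFirstLane32(static_cast<uint32_t>(v));        // low half
      uint32_t hi = readFirstLane32(static_cast<uint32_t>(v >> 32));  // high half
      return (static_cast<uint64_t>(hi) << 32) | lo;                  // reassemble
    }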
913 // Now that we have the result of our single atomic operation, we need to in optimizeAtomic()
914 // get our individual lane's slice into the result. We use the lane offset in optimizeAtomic()
916 // from the first lane, to get our lane's index into the atomic result. in optimizeAtomic()
959 // For fadd/fsub the first active lane of LaneOffset should be the in optimizeAtomic()
960 // identity (-0.0 for fadd or +0.0 for fsub) but the value we calculated in optimizeAtomic()
964 // For all floating point ops if the in-memory value was a nan then the in optimizeAtomic()
968 // first active lane. in optimizeAtomic()
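A concrete instance of the signed-zero problem described above (worked example, not pass code): if memory held -0.0, the first active lane's fadd would have returned -0.0, but reconstructing it as broadcast + 0.0 yields +0.0.

    #include <cstdio>

    int main() {
      double oldValue = -0.0;
      double reconstructed = oldValue + 0.0;               // broadcast value + a 0.0 lane offset
      std::printf("%g vs %g\n", oldValue, reconstructed);  // prints "-0 vs 0": the sign is lost
      return 0;
    }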
973 // Need a final PHI to reconverge above the helper-lane branch mask. in optimizeAtomic()
974 B.SetInsertPoint(PixelExitBB, PixelExitBB->getFirstNonPHIIt()); in optimizeAtomic()
977 PHI->addIncoming(PoisonValue::get(Ty), PixelEntryBB); in optimizeAtomic()
978 PHI->addIncoming(Result, I.getParent()); in optimizeAtomic()