Lines Matching +full:single +full:- +full:lane

1 //===-- AMDGPUAtomicOptimizer.cpp -----------------------------------------===//
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
10 /// This pass optimizes atomic operations by using a single lane of a wavefront
15 /// 1. DPP -
18 /// 2. Iterative -
22 //===----------------------------------------------------------------------===//
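As a rough host-side model of what the pass achieves, assuming an integer add for concreteness (an illustration, not the pass's own code): rather than every active lane issuing its own atomic to the same address, the wavefront reduces its values, one lane issues a single atomic with the combined value, and each lane rebuilds the value its own atomic would have returned from the old memory contents plus an exclusive prefix sum of the lanes below it.

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Model only: the wavefront is an array of per-lane values, "memory" is the
    // shared atomic location. One RMW replaces one-per-lane RMWs.
    uint32_t waveAtomicAdd(uint32_t &memory, const std::vector<uint32_t> &laneValues,
                           std::vector<uint32_t> &laneResults) {
      uint32_t sum = 0;
      for (uint32_t v : laneValues)        // wave-wide reduction
        sum += v;
      uint32_t old = memory;               // the single atomic RMW, done by one lane
      memory += sum;
      uint32_t prefix = 0;
      for (std::size_t lane = 0; lane < laneValues.size(); ++lane) {
        laneResults[lane] = old + prefix;  // what each lane's own atomicrmw would return
        prefix += laneValues[lane];        // exclusive prefix sum of the lanes below
      }
      return old;
    }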
36 #define DEBUG_TYPE "amdgpu-atomic-optimizer"
123 DomTreeUpdater DTU(DTW ? &DTW->getDomTree() : nullptr, in runOnFunction()
182 switch (Ty->getTypeID()) { in isLegalCrossLaneType()
187 unsigned Size = Ty->getIntegerBitWidth(); in isLegalCrossLaneType()
228 !(I.getType()->isFloatTy() || I.getType()->isDoubleTy())) { in visitAtomicRMWInst()
235 // If the pointer operand is divergent, then each lane is doing an atomic in visitAtomicRMWInst()
237 if (UA->isDivergentUse(I.getOperandUse(PtrIdx))) { in visitAtomicRMWInst()
241 bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx)); in visitAtomicRMWInst()
243 // If the value operand is divergent, each lane is contributing a different in visitAtomicRMWInst()
248 if (ScanImpl == ScanOptions::DPP && !ST->hasDPP()) in visitAtomicRMWInst()
255 // If we get here, we can optimize the atomic using a single wavefront-wide in visitAtomicRMWInst()
327 const bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx)); in visitIntrinsicInst()
329 // If the value operand is divergent, each lane is contributing a different in visitIntrinsicInst()
334 if (ScanImpl == ScanOptions::DPP && !ST->hasDPP()) in visitIntrinsicInst()
344 if (UA->isDivergentUse(I.getOperandUse(Idx))) { in visitIntrinsicInst()
349 // If we get here, we can optimize the atomic using a single wavefront-wide in visitIntrinsicInst()
357 // Use the builder to create the non-atomic counterpart of the specified
408 Type *AtomicTy = V->getType(); in buildReduction()
409 Module *M = B.GetInsertBlock()->getModule(); in buildReduction()
423 assert(ST->hasPermLaneX16()); in buildReduction()
425 V->getType(), Intrinsic::amdgcn_permlanex16, in buildReduction()
426 {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()}); in buildReduction()
428 if (ST->isWave32()) { in buildReduction()
432 if (ST->hasPermLane64()) { in buildReduction()
435 B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_permlane64, V); in buildReduction()
439 // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and in buildReduction()
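A host-side sketch of the cross-lane combining that the permlanex16/permlane64/readlane calls above perform during the reduction (illustrative only; it uses a generic butterfly exchange rather than the exact DPP sequence, and assumes an integer add):

    #include <array>
    #include <cstdint>

    // Lanes repeatedly exchange values with a partner lane at increasing distance
    // and combine them; after log2(64) steps every lane holds the full wave sum.
    uint32_t waveReduceAdd(std::array<uint32_t, 64> lanes) {
      for (unsigned stride = 1; stride < 64; stride *= 2) {
        std::array<uint32_t, 64> partner = lanes;
        for (unsigned lane = 0; lane < 64; ++lane)
          lanes[lane] += partner[lane ^ stride];  // exchange-and-combine step
      }
      return lanes[0];                            // every lane now holds the same total
    }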
453 Type *AtomicTy = V->getType(); in buildScan()
454 Module *M = B.GetInsertBlock()->getModule(); in buildScan()
465 if (ST->hasDPPBroadcasts()) { in buildScan()
478 // On GFX10 all DPP operations are confined to a single row. To get cross- in buildScan()
481 // Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes in buildScan()
483 assert(ST->hasPermLaneX16()); in buildScan()
485 V->getType(), Intrinsic::amdgcn_permlanex16, in buildScan()
486 {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()}); in buildScan()
493 if (!ST->isWave32()) { in buildScan()
494 // Combine lane 31 into lanes 32..63. in buildScan()
496 V->getType(), Intrinsic::amdgcn_readlane, {V, B.getInt32(31)}); in buildScan()
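The shifted combines above build a wave-wide inclusive prefix scan; a host-side sketch of the same shift-and-add pattern (illustrative, integer add assumed, row boundaries ignored):

    #include <array>
    #include <cstdint>

    // Each step adds the value from the lane `shift` positions lower, doubling the
    // shift every step (Hillis-Steele scan). Afterwards lane N holds the sum of
    // lanes 0..N, i.e. an inclusive scan.
    std::array<uint32_t, 64> waveInclusiveScanAdd(std::array<uint32_t, 64> lanes) {
      for (unsigned shift = 1; shift < 64; shift *= 2) {
        std::array<uint32_t, 64> prev = lanes;
        for (unsigned lane = shift; lane < 64; ++lane)
          lanes[lane] += prev[lane - shift];  // combine with the lane `shift` below
      }
      return lanes;
    }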
512 Type *AtomicTy = V->getType(); in buildShiftRight()
513 Module *M = B.GetInsertBlock()->getModule(); in buildShiftRight()
516 if (ST->hasDPPWavefrontShifts()) { in buildShiftRight()
527 // On GFX10 all DPP operations are confined to a single row. To get cross- in buildShiftRight()
534 // Copy the old lane 15 to the new lane 16. in buildShiftRight()
538 if (!ST->isWave32()) { in buildShiftRight()
539 // Copy the old lane 31 to the new lane 32. in buildShiftRight()
544 // Copy the old lane 47 to the new lane 48. in buildShiftRight()
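buildShiftRight turns that inclusive scan into an exclusive one by moving every lane's value one lane up and seeding lane 0 with the operation's identity; a minimal model (integer add assumed):

    #include <array>
    #include <cstdint>

    // Old lane 15 becomes new lane 16, old lane 31 becomes new lane 32, and so on;
    // lane 0 receives the identity so its exclusive scan value is "nothing yet".
    std::array<uint32_t, 64> shiftRightOneLane(const std::array<uint32_t, 64> &lanes,
                                               uint32_t identity) {
      std::array<uint32_t, 64> out{};
      out[0] = identity;
      for (unsigned lane = 1; lane < 64; ++lane)
        out[lane] = lanes[lane - 1];
      return out;
    }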
563 auto *WaveTy = B.getIntNTy(ST->getWavefrontSize()); in buildScanIteratively()
574 Accumulator->addIncoming(Identity, EntryBB); in buildScanIteratively()
578 OldValuePhi->addIncoming(PoisonValue::get(Ty), EntryBB); in buildScanIteratively()
581 ActiveBits->addIncoming(Ballot, EntryBB); in buildScanIteratively()
583 // Use the llvm.cttz intrinsic to find the lowest remaining active lane. in buildScanIteratively()

590 Value *LaneValue = B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_readlane, in buildScanIteratively()
597 OldValue = B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_writelane, in buildScanIteratively()
599 OldValuePhi->addIncoming(OldValue, ComputeLoop); in buildScanIteratively()
604 Accumulator->addIncoming(NewAccumulator, ComputeLoop); in buildScanIteratively()
606 // Clear the current active lane's bit so that on the next iteration llvm.cttz in buildScanIteratively()
607 // returns the next active lane. in buildScanIteratively()
610 auto *InverseMask = B.CreateXor(Mask, ConstantInt::get(WaveTy, -1)); in buildScanIteratively()
612 ActiveBits->addIncoming(NewActiveBits, ComputeLoop); in buildScanIteratively()
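A host-side model of the iterative strategy shown above, walking the ballot mask from the lowest active lane upward (std::countr_zero stands in for llvm.cttz, and an integer add is assumed):

    #include <array>
    #include <bit>
    #include <cstdint>

    // Hand each active lane the running total so far (its exclusive scan value),
    // fold its own value into the accumulator, then clear its bit so the next
    // countr_zero finds the next active lane.
    uint32_t iterativeScanAdd(uint64_t activeBits,
                              const std::array<uint32_t, 64> &laneValue,
                              std::array<uint32_t, 64> &laneScan) {
      uint32_t accumulator = 0;                        // identity for add
      while (activeBits != 0) {
        unsigned lane = std::countr_zero(activeBits);  // lowest remaining active lane
        laneScan[lane] = accumulator;                  // the writelane of the old accumulator
        accumulator += laneValue[lane];                // readlane + op
        activeBits &= activeBits - 1;                  // clear that lane's bit (the mask/xor/and above)
      }
      return accumulator;                              // wave-wide reduction result
    }

The accumulator that leaves the loop is the reduction used for the single atomic; the per-lane scan values become each lane's offset into the result.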
625 LLVMContext &C = Ty->getContext(); in getIdentityValueForAtomicOp()
626 const unsigned BitWidth = Ty->getPrimitiveSizeInBits(); in getIdentityValueForAtomicOp()
644 return ConstantFP::get(C, APFloat::getZero(Ty->getFltSemantics(), true)); in getIdentityValueForAtomicOp()
646 return ConstantFP::get(C, APFloat::getZero(Ty->getFltSemantics(), false)); in getIdentityValueForAtomicOp()
653 return ConstantFP::get(C, APFloat::getNaN(Ty->getFltSemantics())); in getIdentityValueForAtomicOp()
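The identities returned above exist so that a lane seeded with them leaves the combined result unchanged, i.e. op(identity, x) == x; a small self-checking illustration (the integer cases are not shown in the fragments above but follow the same rule, and the NaN case relies on fmax(NaN, x) == x):

    #include <cassert>
    #include <cmath>
    #include <cstdint>

    int main() {
      uint32_t x = 42;
      assert((x + 0u) == x);                     // add/sub/or/xor identity: 0
      assert((x & ~0u) == x);                    // and identity: all ones
      double d = 1.5;
      assert(d + (-0.0) == d);                   // fadd identity: -0.0 (per the line above)
      assert(std::fmax(std::nan(""), d) == d);   // float max identity: NaN
      return 0;
    }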
659 return (CI && CI->isOne()) ? RHS : B.CreateMul(LHS, RHS); in buildMul()
670 B.setIsFPConstrained(I.getFunction()->hasFnAttribute(Attribute::StrictFP)); in optimizeAtomic()
674 // lane invocations, we need to record the entry and exit BB's. in optimizeAtomic()
679 // entire atomic operation in a helper-lane check. We do not want any helper in optimizeAtomic()
681 // in any cross-lane communication, and we use a branch on whether the lane is in optimizeAtomic()
700 bool isAtomicFloatingPointTy = Ty->isFloatingPointTy(); in optimizeAtomic()
701 [[maybe_unused]] const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty); in optimizeAtomic()
709 Type *const WaveTy = B.getIntNTy(ST->getWavefrontSize()); in optimizeAtomic()
714 // below us. If we counted each lane linearly starting from 0, a lane is in optimizeAtomic()
718 if (ST->isWave32()) { in optimizeAtomic()
731 LLVMContext &C = F->getContext(); in optimizeAtomic()
733 // For atomic sub, perform the scan with an add operation and allow one lane to in optimizeAtomic()
750 // If we have a divergent value in each lane, we need to combine the value in optimizeAtomic()
758 if (!NeedResult && ST->hasPermLaneX16()) { in optimizeAtomic()
767 // Read the value from the last lane, which has accumulated the values in optimizeAtomic()
768 // of each active lane in the wavefront. This will be our new value in optimizeAtomic()
770 Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1); in optimizeAtomic()
830 // We only want a single lane to enter our new control flow, and we do this in optimizeAtomic()
831 // by checking if there are any active lanes below us. Only one lane will in optimizeAtomic()
838 // We need to introduce some new control flow to force a single lane to be in optimizeAtomic()
841 // entry --> single_lane -\ in optimizeAtomic()
842 //      \------------------> exit in optimizeAtomic()
846 // At this point, we have split I's block to allow one lane in the wavefront in optimizeAtomic()
853 // single lane done updating the final reduced value. in optimizeAtomic()
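Putting the pieces together, a host-side model of the control flow sketched above: the lane whose count of lower active lanes (mbcnt) is zero takes the single_lane path and performs the one real atomic, every other lane skips it, and all lanes pick up the broadcast old value plus their own scan offset at the exit block (illustrative only; integer add assumed):

    #include <array>
    #include <bit>
    #include <cstdint>

    uint32_t singleLaneAtomicAdd(uint32_t &memory, uint64_t execMask,
                                 const std::array<uint32_t, 64> &laneScan,
                                 std::array<uint32_t, 64> &laneResult,
                                 uint32_t waveSum) {
      uint32_t broadcastOld = 0;
      for (unsigned lane = 0; lane < 64; ++lane) {
        if (!((execMask >> lane) & 1))
          continue;                                        // inactive helper lane: does nothing
        bool lowest =                                      // mbcnt of lower active lanes == 0
            std::popcount(execMask & ((uint64_t{1} << lane) - 1)) == 0;
        if (lowest) {                                      // the "single_lane" block
          broadcastOld = memory;                           // old value from the one real atomic
          memory += waveSum;
        }
        laneResult[lane] = broadcastOld + laneScan[lane];  // "exit": broadcast + lane offset
      }
      return broadcastOld;
    }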
860 BranchInst *Terminator = cast<BranchInst>(OriginalBB->getTerminator()); in optimizeAtomic()
862 Terminator->removeFromParent(); in optimizeAtomic()
877 for (auto *Succ : Terminator->successors()) { in optimizeAtomic()
891 // Clone the original atomic operation into the single lane, replacing the in optimizeAtomic()
895 NewI->setOperand(ValIdx, NewV); in optimizeAtomic()
904 PHI->addIncoming(PoisonValue::get(Ty), Predecessor); in optimizeAtomic()
905 PHI->addIncoming(NewI, SingleLaneTerminator->getParent()); in optimizeAtomic()
907 // We need to broadcast the value from the lowest active lane (the first in optimizeAtomic()
908 // lane) to all other lanes in the wavefront. We use an intrinsic for this, in optimizeAtomic()
909 // but have to handle 64-bit broadcasts with two calls to this intrinsic. in optimizeAtomic()
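A sketch of that 64-bit broadcast: split the value, broadcast each 32-bit half, and reassemble (readFirstLane32 is an identity stub standing in for the 32-bit readfirstlane intrinsic):

    #include <cstdint>

    // Stand-in for a 32-bit readfirstlane; on hardware it returns lane 0's copy.
    static uint32_t readFirstLane32(uint32_t v) { return v; }

    uint64_t broadcastFirstLane64(uint64_t v) {
      uint32_t lo = readFirstLane32(static_cast<uint32_t>(v));        // low half
      uint32_t hi = readFirstLane32(static_cast<uint32_t>(v >> 32));  // high half
      return (static_cast<uint64_t>(hi) << 32) | lo;                  // reassemble
    }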
913 // Now that we have the result of our single atomic operation, we need to in optimizeAtomic()
914 // get our individual lane's slice into the result. We use the lane offset in optimizeAtomic()
916 // from the first lane, to get our lane's index into the atomic result. in optimizeAtomic()
959 // For fadd/fsub the first active lane of LaneOffset should be the in optimizeAtomic()
960 // identity (-0.0 for fadd or +0.0 for fsub) but the value we calculated in optimizeAtomic()
964 // For all floating point ops if the in-memory value was a nan then the in optimizeAtomic()
968 // first active lane. in optimizeAtomic()
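A concrete instance of the signed-zero problem described above (worked example, not pass code): if memory held -0.0, the first active lane's fadd would have returned -0.0, but reconstructing it as broadcast + 0.0 yields +0.0.

    #include <cstdio>

    int main() {
      double oldValue = -0.0;
      double reconstructed = oldValue + 0.0;               // broadcast value + a 0.0 lane offset
      std::printf("%g vs %g\n", oldValue, reconstructed);  // prints "-0 vs 0": the sign is lost
      return 0;
    }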
973 // Need a final PHI to reconverge above the helper-lane branch mask. in optimizeAtomic()
974 B.SetInsertPoint(PixelExitBB, PixelExitBB->getFirstNonPHIIt()); in optimizeAtomic()
977 PHI->addIncoming(PoisonValue::get(Ty), PixelEntryBB); in optimizeAtomic()
978 PHI->addIncoming(Result, I.getParent()); in optimizeAtomic()