AMDGPUCodeGenPrepare.cpp - OpenGrok cross reference for /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp

Lines Matching +full:ulp +full:- +full:allow
1 //===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
13 //===----------------------------------------------------------------------===//
35 #define DEBUG_TYPE "amdgpu-codegenprepare"
43   "amdgpu-codegenprepare-widen-constant-loads",
44   cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
49   "amdgpu-codegenprepare-widen-16-bit-ops",
50   cl::desc("Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"),
55     BreakLargePHIs("amdgpu-codegenprepare-break-large-phis",
60     ForceBreakLargePHIs("amdgpu-codegenprepare-force-break-large-phis",
66     "amdgpu-codegenprepare-break-large-phis-threshold",
71   "amdgpu-codegenprepare-mul24",
76 // Legalize 64-bit division by using the generic IR expansion.
78   "amdgpu-codegenprepare-expand-div64",
79   cl::desc("Expand 64-bit division in AMDGPUCodeGenPrepare"),
86   "amdgpu-codegenprepare-disable-idiv-expansion",
93   "amdgpu-codegenprepare-disable-fdiv-expansion",
121     LLVMContext &Ctx = Mod->getContext();  in getSqrtF32()
131     LLVMContext &Ctx = Mod->getContext();  in getLdexpF32()
225   /// unsigned integer. Truncating to this size and then zero-extending to
230   /// signed integer. Truncating to this size and then sign-extending to
268   //  memory / to a full 32-bits and then truncate the input to allow a scalar
358     for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;  in run()
365         BasicBlock *NextInstBB = Next->getParent();  in run()
368           E = BB->end();  in run()
380   if (T->isIntegerTy())  in getBaseElementBitWidth()
381     return T->getIntegerBitWidth();  in getBaseElementBitWidth()
382   return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();  in getBaseElementBitWidth()
388   if (T->isIntegerTy())  in getI32Ty()
400       cast<ICmpInst>(I.getOperand(0))->isSigned() : false;  in isSigned()
408   if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)  in needsPromotionToI32()
414     if (ST->hasVOP3PInsts())  in needsPromotionToI32()
417     return needsPromotionToI32(VT->getElementType());  in needsPromotionToI32()
424   return Ty->isFloatTy() || Ty->isDoubleTy() ||  in isLegalFloatingTy()
425          (Ty->isHalfTy() && ST->has16BitInsts());  in isLegalFloatingTy()
458   const DataLayout &DL = Mod->getDataLayout();  in canWidenScalarExtLoad()
462   return I.isSimple() && TySize < 32 && Alignment >= 4 && UA->isUniform(&I);  in canWidenScalarExtLoad()
495       Inst->setHasNoSignedWrap();  in promoteUniformOpToI32()
498       Inst->setHasNoUnsignedWrap();  in promoteUniformOpToI32()
501       Inst->setIsExact(ExactOp->isExact());  in promoteUniformOpToI32()
513   assert(needsPromotionToI32(I.getOperand(0)->getType()) &&  in promoteUniformOpToI32()
519   Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());  in promoteUniformOpToI32()
584       Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));  in promoteUniformBitreverseToI32()
604   auto *VT = dyn_cast<FixedVectorType>(V->getType());  in extractValues()
610   for (int I = 0, E = VT->getNumElements(); I != E; ++I)  in extractValues()
617   if (!Ty->isVectorTy()) {  in insertValues()
634   unsigned Size = Ty->getScalarSizeInBits();  in replaceMulWithMul24()
635   if (Size <= 16 && ST->has16BitInsts())  in replaceMulWithMul24()
639   if (UA->isUniform(&I))  in replaceMulWithMul24()
650   if (ST->hasMulU24() && (LHSBits = numBitsUnsigned(LHS)) <= 24 &&  in replaceMulWithMul24()
654   } else if (ST->hasMulI24() && (LHSBits = numBitsSigned(LHS)) <= 24 &&  in replaceMulWithMul24()
669   Type *DstTy = LHSVals[0]->getType();  in replaceMulWithMul24()
685   NewVal->takeName(&I);  in replaceMulWithMul24()
700     if (SelectInst *Sel = dyn_cast<SelectInst>(Cast->getOperand(0)))  in findSelectThroughCast()
717   if (!Sel || !Sel->hasOneUse()) {  in foldBinOpIntoSelect()
722   if (!Sel || !Sel->hasOneUse())  in foldBinOpIntoSelect()
725   Constant *CT = dyn_cast<Constant>(Sel->getTrueValue());  in foldBinOpIntoSelect()
726   Constant *CF = dyn_cast<Constant>(Sel->getFalseValue());  in foldBinOpIntoSelect()
732     if (!CastOp->hasOneUse())  in foldBinOpIntoSelect()
734     CT = ConstantFoldCastOperand(CastOp->getOpcode(), CT, BO.getType(), *DL);  in foldBinOpIntoSelect()
735     CF = ConstantFoldCastOperand(CastOp->getOpcode(), CF, BO.getType(), *DL);  in foldBinOpIntoSelect()
738   // TODO: Handle special 0/-1 cases DAG combine does, although we only really  in foldBinOpIntoSelect()
755     Builder.setFastMathFlags(FPOp->getFastMathFlags());  in foldBinOpIntoSelect()
757   Value *NewSelect = Builder.CreateSelect(Sel->getCondition(),  in foldBinOpIntoSelect()
759   NewSelect->takeName(&BO);  in foldBinOpIntoSelect()
763     CastOp->eraseFromParent();  in foldBinOpIntoSelect()
764   Sel->eraseFromParent();  in foldBinOpIntoSelect()
771   Type *Ty = Src->getType();  in getFrexpResults()
781       ST->hasFractBug()  in getFrexpResults()
788 /// Emit an expansion of 1.0 / Src good for 1ulp that supports denormals.
793   // -1.0 / x -> rcp (fneg x)  in emitRcpIEEE1ULP()
800   // Expand as 2^-n * (1.0 / (x * 2^n))  in emitRcpIEEE1ULP()
812 /// Emit a 2ulp expansion for fdiv by using frexp for input scaling.
819   if (HasFP32DenormalFlush && ST->hasFractBug() && !ST->hasFastFMAF32() &&  in emitFrexpDiv()
833   // We multiplied by 2^N/2^M, so we need to multiply by 2^(N-M) to scale the  in emitFrexpDiv()
839 /// Emit a sqrt that handles denormals and is accurate to 2ulp.
843   Type *Ty = Src->getType();  in emitSqrtIEEE2ULP()
845       APFloat::getSmallestNormalized(Ty->getFltSemantics());  in emitSqrtIEEE2ULP()
858       Builder.CreateSelect(NeedScale, Builder.getInt32(-16), Zero);  in emitSqrtIEEE2ULP()
862 /// Emit an expansion of 1.0 / sqrt(Src) good for 1ulp that supports denormals.
865   // bool need_scale = x < 0x1p-126f;  in emitRsqIEEE1ULP()
870   Type *Ty = Src->getType();  in emitRsqIEEE1ULP()
872       APFloat::getSmallestNormalized(Ty->getFltSemantics());  in emitRsqIEEE1ULP()
878       ConstantFP::get(Ty, IsNegative ? -0x1.0p+12 : 0x1.0p+12);  in emitRsqIEEE1ULP()
885       NeedScale, OutputScale, IsNegative ? ConstantFP::get(Ty, -1.0) : One);  in emitRsqIEEE1ULP()
893   // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp.  in canOptimizeWithRsq()
897   // v_rsq_f32 gives 1ulp  in canOptimizeWithRsq()
899          SqrtOp->getFPAccuracy() >= 1.0f;  in canOptimizeWithRsq()
905   // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp.  in optimizeWithRsq()
908   // rsq_f16 is accurate to 0.51 ulp.  in optimizeWithRsq()
909   // rsq_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.  in optimizeWithRsq()
915   assert(Den->getType()->isFloatTy());  in optimizeWithRsq()
920   if (CLHS->isExactlyValue(1.0) || (IsNegative = CLHS->isExactlyValue(-1.0))) {  in optimizeWithRsq()
928       // -1.0 / sqrt(x) -> fneg(rsq(x))  in optimizeWithRsq()
940 // 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
941 //               allowed with unsafe-fp-math or afn.
943 // a/b -> a*rcp(b) when arcp is allowed, and we only need to provide ULP 1.0
948   // rcp_f16 is accurate to 0.51 ulp.  in optimizeWithRcp()
949   // rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.  in optimizeWithRcp()
951   assert(Den->getType()->isFloatTy());  in optimizeWithRcp()
955     if (CLHS->isExactlyValue(1.0) ||  in optimizeWithRcp()
956         (IsNegative = CLHS->isExactlyValue(-1.0))) {  in optimizeWithRcp()
960         // -1.0 / x -> 1.0 / fneg(x)  in optimizeWithRcp()
965         // the CI documentation has a worst case error of 1 ulp.  in optimizeWithRcp()
966         // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK  in optimizeWithRcp()
974         // 1.0 / x -> rcp(x)  in optimizeWithRcp()
985     // x / y -> x * (1.0 / y)  in optimizeWithRcp()
1003 // a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
1005 // 1/x -> fdiv.fast(1,x)  when !fpmath >= 2.5ulp.
1010   // fdiv.fast can achieve 2.5 ULP accuracy.  in optimizeWithFDivFast()
1015   assert(Den->getType()->isFloatTy());  in optimizeWithFDivFast()
1019     if (CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0))  in optimizeWithFDivFast()
1063 //   1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
1064 //                 allowed with unsafe-fp-math or afn.
1066 //   a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn.
1069 //   a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
1071 //   1/x -> fdiv.fast(1,x)  when !fpmath >= 2.5ulp.
1078   Type *Ty = FDiv.getType()->getScalarType();  in visitFDiv()
1079   if (!Ty->isFloatTy())  in visitFDiv()
1086   const FastMathFlags DivFMF = FPOp->getFastMathFlags();  in visitFDiv()
1087   const float ReqdAccuracy = FPOp->getFPAccuracy();  in visitFDiv()
1096   if (DenII && DenII->getIntrinsicID() == Intrinsic::sqrt &&  in visitFDiv()
1097       DenII->hasOneUse()) {  in visitFDiv()
1099     SqrtFMF = SqrtOp->getFastMathFlags();  in visitFDiv()
1101       RsqOp = SqrtOp->getOperand(0);  in visitFDiv()
1104   // Inaccurate rcp is allowed with unsafe-fp-math or afn.  in visitFDiv()
1111   // don't need any pre-consideration here when we have better information. A  in visitFDiv()
1150         NewEltInst->copyMetadata(FDiv);  in visitFDiv()
1160     NewVal->takeName(&FDiv);  in visitFDiv()
1168   Attribute Attr = F.getFnAttribute("unsafe-fp-math");  in hasUnsafeFPMath()
1192 /// first one is insufficient. Returns -1 on failure.
1196   const DataLayout &DL = Mod->getDataLayout();  in getDivNumBits()
1199     return -1;  in getDivNumBits()
1203     return -1;  in getDivNumBits()
1206   unsigned DivBits = Num->getType()->getScalarSizeInBits() - SignBits;  in getDivNumBits()
1213 // a 24-bit signed integer.
1218   unsigned SSBits = Num->getType()->getScalarSizeInBits();  in expandDivRem24()
1220   unsigned AtLeast = (SSBits <= 24) ? 0 : (SSBits - 24 + IsSigned);  in expandDivRem24()
1222   if (DivBits == -1)  in expandDivRem24()
1242     // jq = jq >> (bitsize - 2)  in expandDivRem24Impl()
1270   FQ->copyFastMathFlags(Builder.getFastMathFlags());  in expandDivRem24Impl()
1272   // float fqneg = -fq;  in expandDivRem24Impl()
1276   auto FMAD = !ST->hasMadMacF32Insts()  in expandDivRem24Impl()
1280                                       {FQNeg->getType()}, {FQNeg, FB, FA}, FQ);  in expandDivRem24Impl()
1311       int InRegBits = 32 - DivBits;  in expandDivRem24Impl()
1317         = Builder.getInt32((UINT64_C(1) << DivBits) - 1);  in expandDivRem24Impl()
1335     if (C->getType()->getScalarSizeInBits() <= 32)  in divHasSpecialOptimization()
1350     // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2  in divHasSpecialOptimization()
1351     if (BinOpDen->getOpcode() == Instruction::Shl &&  in divHasSpecialOptimization()
1352         isa<Constant>(BinOpDen->getOperand(0)) &&  in divHasSpecialOptimization()
1353         isKnownToBeAPowerOfTwo(BinOpDen->getOperand(0), *DL, true,  in divHasSpecialOptimization()
1366     return Constant::getAllOnesValue(V->getType());  in getSign32()
1368     return Constant::getNullValue(V->getType());  in getSign32()
1389   Type *Ty = X->getType();  in expandDivRem32()
1393   if (Ty->getScalarSizeInBits() != 32) {  in expandDivRem32()
1432   //   unsigned z = (unsigned)((4294967296.0 - 512.0) * v_rcp_f32((float)y));  in expandDivRem32()
1434   //   // One round of UNR (Unsigned integer Newton-Raphson) to improve z.  in expandDivRem32()
1435   //   // Empirically this is guaranteed to give a "two-y" lower bound on  in expandDivRem32()
1437   //   z += umulh(z, -y * z);  in expandDivRem32()
1441   //   unsigned r = x - q * y;  in expandDivRem32()
1446   //     r -= y;  in expandDivRem32()
1450   //     r -= y;  in expandDivRem32()
1509   if (NumDivBits == -1)  in shrinkDivRem64()
1521     return IsSigned ? Builder.CreateSExt(Narrowed, Num->getType()) :  in shrinkDivRem64()
1522                       Builder.CreateZExt(Narrowed, Num->getType());  in shrinkDivRem64()
1548   if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&  in visitBinaryOperator()
1549       UA->isUniform(&I) && promoteUniformOpToI32(I))  in visitBinaryOperator()
1559   unsigned ScalarSize = Ty->getScalarSizeInBits();  in visitBinaryOperator()
1575       for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {  in visitBinaryOperator()
1585           // See if this 64-bit division can be shrunk to 32/24-bits before  in visitBinaryOperator()
1589             // The general 64-bit expansion introduces control flow and doesn't  in visitBinaryOperator()
1598           NewEltI->copyIRFlags(&I);  in visitBinaryOperator()
1643     WidenLoad->copyMetadata(I);  in visitLoadInst()
1647     if (auto *Range = WidenLoad->getMetadata(LLVMContext::MD_range)) {  in visitLoadInst()
1649         mdconst::extract<ConstantInt>(Range->getOperand(0));  in visitLoadInst()
1651       if (Lower->isNullValue()) {  in visitLoadInst()
1652         WidenLoad->setMetadata(LLVMContext::MD_range, nullptr);  in visitLoadInst()
1655           ConstantAsMetadata::get(ConstantInt::get(I32Ty, Lower->getValue().zext(32))),  in visitLoadInst()
1660         WidenLoad->setMetadata(LLVMContext::MD_range,  in visitLoadInst()
1661                                MDNode::get(Mod->getContext(), LowAndHigh));  in visitLoadInst()
1665     int TySize = Mod->getDataLayout().getTypeSizeInBits(I.getType());  in visitLoadInst()
1680   if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&  in visitICmpInst()
1681       UA->isUniform(&I))  in visitICmpInst()
1694   if (ST->has16BitInsts() && needsPromotionToI32(I.getType())) {  in visitSelectInst()
1695     if (UA->isUniform(&I))  in visitSelectInst()
1709   Builder.setFastMathFlags(FPOp->getFastMathFlags());  in visitSelectInst()
1726   Fract->takeName(&I);  in visitSelectInst()
1735   return IA && IB && IA->getParent() == IB->getParent();  in areInSameBB()
1741   const auto *FVT = dyn_cast<FixedVectorType>(V->getType());  in isInterestingPHIIncomingValue()
1748   BitVector EltsCovered(FVT->getNumElements());  in isInterestingPHIIncomingValue()
1750     const auto *Idx = dyn_cast<ConstantInt>(IE->getOperand(2));  in isInterestingPHIIncomingValue()
1752     // Non constant index/out of bounds index -> folding is unlikely.  in isInterestingPHIIncomingValue()
1755     if (!Idx || Idx->getZExtValue() >= FVT->getNumElements())  in isInterestingPHIIncomingValue()
1758     const auto *VecSrc = IE->getOperand(0);  in isInterestingPHIIncomingValue()
1767     EltsCovered.set(Idx->getZExtValue());  in isInterestingPHIIncomingValue()
1787     return isa<Constant>(SV->getOperand(1)) ||  in isInterestingPHIIncomingValue()
1788            areInSameBB(SV, SV->getOperand(0)) ||  in isInterestingPHIIncomingValue()
1789            areInSameBB(SV, SV->getOperand(1));  in isInterestingPHIIncomingValue()
1816     return It->second;  in canBreakPHINode()
1856     if (any_of(Cur->incoming_values(), isInterestingPHIIncomingValue)) {  in canBreakPHINode()
1873 ///   - The type of the slice (Ty)
1874 ///   - The index in the incoming value's vector where the slice starts (Idx)
1875 ///   - The number of elements in the slice (NumElts).
1879 ///   <4 x i64> -> Split into four i64 slices.
1880 ///     -> [i64, 0, 1], [i64, 1, 1], [i64, 2, 1], [i64, 3, 1]
1881 ///   <5 x i16> -> Split into 2 <2 x i16> slices + a i16 tail.
1882 ///     -> [<2 x i16>, 0, 2], [<2 x i16>, 2, 2], [i16, 4, 1]
1899   ///   - It could cause issues with dominance (e.g. if bb.1 is seen first, then
1902   ///   - We also want to make our extract instructions as local as possible so
1918     IRBuilder<> B(BB->getTerminator());  in getSlicedVal()
1920       B.SetCurrentDebugLocation(IncInst->getDebugLoc());  in getSlicedVal()
1938   // Break-up fixed-vector PHIs into smaller pieces.  in visitPHINode()
1940   // its elements, or into 32-bit pieces (for 8/16 bit elts).  in visitPHINode()
1944   // With large, odd-sized PHIs we may end up needing many `build_vector`  in visitPHINode()
1952   if (!FVT || FVT->getNumElements() == 1 ||  in visitPHINode()
1953       DL->getTypeSizeInBits(FVT) <= BreakLargePHIsThreshold)  in visitPHINode()
1961   Type *EltTy = FVT->getElementType();  in visitPHINode()
1965     // 32-bit slices as we can, and scalarize the tail.  in visitPHINode()
1966     const unsigned EltSize = DL->getTypeSizeInBits(EltTy);  in visitPHINode()
1967     const unsigned NumElts = FVT->getNumElements();  in visitPHINode()
1993     B.SetInsertPoint(I.getParent()->getFirstNonPHIIt());  in visitPHINode()
1997       S.NewPHI->addIncoming(S.getSlicedVal(BB, I.getIncomingValue(Idx),  in visitPHINode()
2035   if (const auto *Arg = dyn_cast<Argument>(V); Arg && Arg->hasNonNullAttr())  in isPtrKnownNeverNull()
2040   if (AS != cast<PointerType>(V->getType())->getAddressSpace())  in isPtrKnownNeverNull()
2050   // address spaces have non-zero null values.  in isPtrKnownNeverNull()
2055   assert((NullVal == 0 || NullVal == -1) &&  in isPtrKnownNeverNull()
2064   if (I.getType()->isVectorTy())  in visitAddrSpaceCastInst()
2113   if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&  in visitBitreverseIntrinsicInst()
2114       UA->isUniform(&I))  in visitBitreverseIntrinsicInst()
2120 /// Match non-nan fract pattern.
2121 ///   minnum(fsub(x, floor(x)), nextafter(1.0, -1.0))
2126   if (ST->hasFractBug())  in matchFractPat()
2133   if (!isLegalFloatingTy(Ty->getScalarType()))  in matchFractPat()
2145   One.convert(C->getSemantics(), APFloat::rmNearestTiesToEven, &LosesInfo);  in matchFractPat()
2147   // Match nextafter(1.0, -1)  in matchFractPat()
2166   Type *Ty = FractArg->getType()->getScalarType();  in applyFractPat()
2172   return insertValues(Builder, FractArg->getType(), ResultVals);  in applyFractPat()
2192   Fract->takeName(&I);  in visitMinNum()
2201   return match(Val, m_APFloat(C)) && C->getExactLog2Abs() == 0;  in isOneOrNegOne()
2204 // Expand llvm.sqrt.f32 calls with !fpmath metadata in a semi-fast way.
2206   Type *Ty = Sqrt.getType()->getScalarType();  in visitSqrt()
2207   if (!Ty->isFloatTy() && (!Ty->isHalfTy() || ST->has16BitInsts()))  in visitSqrt()
2211   FastMathFlags SqrtFMF = FPOp->getFastMathFlags();  in visitSqrt()
2213   // We're trying to handle the fast-but-not-that-fast case only. The lowering  in visitSqrt()
2218   const float ReqdAccuracy = FPOp->getFPAccuracy();  in visitSqrt()
2229   if (FDiv && FDiv->getOpcode() == Instruction::FDiv &&  in visitSqrt()
2230       FDiv->getFPAccuracy() >= 1.0f &&  in visitSqrt()
2231       canOptimizeWithRsq(FPOp, FDiv->getFastMathFlags(), SqrtFMF) &&  in visitSqrt()
2232       // TODO: We should also handle the arcp case for the fdiv with non-1 value  in visitSqrt()
2233       isOneOrNegOne(FDiv->getOperand(0)))  in visitSqrt()
2239   // The raw instruction is 1 ulp, but the correction for denormal handling  in visitSqrt()
2257   NewSqrt->takeName(&Sqrt);  in visitSqrt()
2265   Impl.DL = &Impl.Mod->getDataLayout();  in doInitialization()
2279   const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();  in runOnFunction()
2286   Impl.DT = DTWP ? &DTWP->getDomTree() : nullptr;  in runOnFunction()
2298   Impl.DL = &Impl.Mod->getDataLayout();  in run()