Lines Matching +full:ulp +full:- +full:allow

1 //===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
13 //===----------------------------------------------------------------------===//
35 #define DEBUG_TYPE "amdgpu-codegenprepare"
43 "amdgpu-codegenprepare-widen-constant-loads",
44 cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
49 "amdgpu-codegenprepare-widen-16-bit-ops",
50 cl::desc("Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"),
55 BreakLargePHIs("amdgpu-codegenprepare-break-large-phis",
60 ForceBreakLargePHIs("amdgpu-codegenprepare-force-break-large-phis",
66 "amdgpu-codegenprepare-break-large-phis-threshold",
71 "amdgpu-codegenprepare-mul24",
76 // Legalize 64-bit division by using the generic IR expansion.
78 "amdgpu-codegenprepare-expand-div64",
79 cl::desc("Expand 64-bit division in AMDGPUCodeGenPrepare"),
86 "amdgpu-codegenprepare-disable-idiv-expansion",
93 "amdgpu-codegenprepare-disable-fdiv-expansion",
121 LLVMContext &Ctx = Mod->getContext(); in getSqrtF32()
131 LLVMContext &Ctx = Mod->getContext(); in getLdexpF32()
225 /// unsigned integer. Truncating to this size and then zero-extending to
230 /// signed integer. Truncating to this size and then sign-extending to
268 // memory / to a full 32-bits and then truncate the input to allow a scalar
358 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; in run()
365 BasicBlock *NextInstBB = Next->getParent(); in run()
368 E = BB->end(); in run()
380 if (T->isIntegerTy()) in getBaseElementBitWidth()
381 return T->getIntegerBitWidth(); in getBaseElementBitWidth()
382 return cast<VectorType>(T)->getElementType()->getIntegerBitWidth(); in getBaseElementBitWidth()
388 if (T->isIntegerTy()) in getI32Ty()
400 cast<ICmpInst>(I.getOperand(0))->isSigned() : false; in isSigned()
408 if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16) in needsPromotionToI32()
414 if (ST->hasVOP3PInsts()) in needsPromotionToI32()
417 return needsPromotionToI32(VT->getElementType()); in needsPromotionToI32()
424 return Ty->isFloatTy() || Ty->isDoubleTy() || in isLegalFloatingTy()
425 (Ty->isHalfTy() && ST->has16BitInsts()); in isLegalFloatingTy()
458 const DataLayout &DL = Mod->getDataLayout(); in canWidenScalarExtLoad()
462 return I.isSimple() && TySize < 32 && Alignment >= 4 && UA->isUniform(&I); in canWidenScalarExtLoad()
495 Inst->setHasNoSignedWrap(); in promoteUniformOpToI32()
498 Inst->setHasNoUnsignedWrap(); in promoteUniformOpToI32()
501 Inst->setIsExact(ExactOp->isExact()); in promoteUniformOpToI32()
513 assert(needsPromotionToI32(I.getOperand(0)->getType()) && in promoteUniformOpToI32()
519 Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType()); in promoteUniformOpToI32()
584 Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType())); in promoteUniformBitreverseToI32()
604 auto *VT = dyn_cast<FixedVectorType>(V->getType()); in extractValues()
610 for (int I = 0, E = VT->getNumElements(); I != E; ++I) in extractValues()
617 if (!Ty->isVectorTy()) { in insertValues()
634 unsigned Size = Ty->getScalarSizeInBits(); in replaceMulWithMul24()
635 if (Size <= 16 && ST->has16BitInsts()) in replaceMulWithMul24()
639 if (UA->isUniform(&I)) in replaceMulWithMul24()
650 if (ST->hasMulU24() && (LHSBits = numBitsUnsigned(LHS)) <= 24 && in replaceMulWithMul24()
654 } else if (ST->hasMulI24() && (LHSBits = numBitsSigned(LHS)) <= 24 && in replaceMulWithMul24()
669 Type *DstTy = LHSVals[0]->getType(); in replaceMulWithMul24()
685 NewVal->takeName(&I); in replaceMulWithMul24()
700 if (SelectInst *Sel = dyn_cast<SelectInst>(Cast->getOperand(0))) in findSelectThroughCast()
717 if (!Sel || !Sel->hasOneUse()) { in foldBinOpIntoSelect()
722 if (!Sel || !Sel->hasOneUse()) in foldBinOpIntoSelect()
725 Constant *CT = dyn_cast<Constant>(Sel->getTrueValue()); in foldBinOpIntoSelect()
726 Constant *CF = dyn_cast<Constant>(Sel->getFalseValue()); in foldBinOpIntoSelect()
732 if (!CastOp->hasOneUse()) in foldBinOpIntoSelect()
734 CT = ConstantFoldCastOperand(CastOp->getOpcode(), CT, BO.getType(), *DL); in foldBinOpIntoSelect()
735 CF = ConstantFoldCastOperand(CastOp->getOpcode(), CF, BO.getType(), *DL); in foldBinOpIntoSelect()
738 // TODO: Handle special 0/-1 cases DAG combine does, although we only really in foldBinOpIntoSelect()
755 Builder.setFastMathFlags(FPOp->getFastMathFlags()); in foldBinOpIntoSelect()
757 Value *NewSelect = Builder.CreateSelect(Sel->getCondition(), in foldBinOpIntoSelect()
759 NewSelect->takeName(&BO); in foldBinOpIntoSelect()
763 CastOp->eraseFromParent(); in foldBinOpIntoSelect()
764 Sel->eraseFromParent(); in foldBinOpIntoSelect()
771 Type *Ty = Src->getType(); in getFrexpResults()
781 ST->hasFractBug() in getFrexpResults()
788 /// Emit an expansion of 1.0 / Src good for 1ulp that supports denormals.
793 // -1.0 / x -> rcp (fneg x) in emitRcpIEEE1ULP()
800 // Expand as 2^-n * (1.0 / (x * 2^n)) in emitRcpIEEE1ULP()
812 /// Emit a 2ulp expansion for fdiv by using frexp for input scaling.
819 if (HasFP32DenormalFlush && ST->hasFractBug() && !ST->hasFastFMAF32() && in emitFrexpDiv()
833 // We multiplied by 2^N/2^M, so we need to multiply by 2^(N-M) to scale the in emitFrexpDiv()
839 /// Emit a sqrt that handles denormals and is accurate to 2ulp.
843 Type *Ty = Src->getType(); in emitSqrtIEEE2ULP()
845 APFloat::getSmallestNormalized(Ty->getFltSemantics()); in emitSqrtIEEE2ULP()
858 Builder.CreateSelect(NeedScale, Builder.getInt32(-16), Zero); in emitSqrtIEEE2ULP()
862 /// Emit an expansion of 1.0 / sqrt(Src) good for 1ulp that supports denormals.
865 // bool need_scale = x < 0x1p-126f; in emitRsqIEEE1ULP()
870 Type *Ty = Src->getType(); in emitRsqIEEE1ULP()
872 APFloat::getSmallestNormalized(Ty->getFltSemantics()); in emitRsqIEEE1ULP()
878 ConstantFP::get(Ty, IsNegative ? -0x1.0p+12 : 0x1.0p+12); in emitRsqIEEE1ULP()
885 NeedScale, OutputScale, IsNegative ? ConstantFP::get(Ty, -1.0) : One); in emitRsqIEEE1ULP()
893 // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp. in canOptimizeWithRsq()
897 // v_rsq_f32 gives 1ulp in canOptimizeWithRsq()
899 SqrtOp->getFPAccuracy() >= 1.0f; in canOptimizeWithRsq()
905 // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp. in optimizeWithRsq()
908 // rsq_f16 is accurate to 0.51 ulp. in optimizeWithRsq()
909 // rsq_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed. in optimizeWithRsq()
915 assert(Den->getType()->isFloatTy()); in optimizeWithRsq()
920 if (CLHS->isExactlyValue(1.0) || (IsNegative = CLHS->isExactlyValue(-1.0))) { in optimizeWithRsq()
928 // -1.0 / sqrt(x) -> fneg(rsq(x)) in optimizeWithRsq()
940 // 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
941 // allowed with unsafe-fp-math or afn.
943 // a/b -> a*rcp(b) when arcp is allowed, and we only need provide ULP 1.0
948 // rcp_f16 is accurate to 0.51 ulp. in optimizeWithRcp()
949 // rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed. in optimizeWithRcp()
951 assert(Den->getType()->isFloatTy()); in optimizeWithRcp()
955 if (CLHS->isExactlyValue(1.0) || in optimizeWithRcp()
956 (IsNegative = CLHS->isExactlyValue(-1.0))) { in optimizeWithRcp()
960 // -1.0 / x -> 1.0 / fneg(x) in optimizeWithRcp()
965 // the CI documentation has a worst case error of 1 ulp. in optimizeWithRcp()
966 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK in optimizeWithRcp()
974 // 1.0 / x -> rcp(x) in optimizeWithRcp()
985 // x / y -> x * (1.0 / y) in optimizeWithRcp()
1003 // a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
1005 // 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
1010 // fdiv.fast can achieve 2.5 ULP accuracy. in optimizeWithFDivFast()
1015 assert(Den->getType()->isFloatTy()); in optimizeWithFDivFast()
1019 if (CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0)) in optimizeWithFDivFast()
1063 // 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
1064 // allowed with unsafe-fp-math or afn.
1066 // a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn.
1069 // a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
1071 // 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
1078 Type *Ty = FDiv.getType()->getScalarType(); in visitFDiv()
1079 if (!Ty->isFloatTy()) in visitFDiv()
1086 const FastMathFlags DivFMF = FPOp->getFastMathFlags(); in visitFDiv()
1087 const float ReqdAccuracy = FPOp->getFPAccuracy(); in visitFDiv()
1096 if (DenII && DenII->getIntrinsicID() == Intrinsic::sqrt && in visitFDiv()
1097 DenII->hasOneUse()) { in visitFDiv()
1099 SqrtFMF = SqrtOp->getFastMathFlags(); in visitFDiv()
1101 RsqOp = SqrtOp->getOperand(0); in visitFDiv()
1104 // Inaccurate rcp is allowed with unsafe-fp-math or afn. in visitFDiv()
1111 // don't need any pre-consideration here when we have better information. A in visitFDiv()
1150 NewEltInst->copyMetadata(FDiv); in visitFDiv()
1160 NewVal->takeName(&FDiv); in visitFDiv()
1168 Attribute Attr = F.getFnAttribute("unsafe-fp-math"); in hasUnsafeFPMath()
1192 /// first one is insufficient. Returns -1 on failure.
1196 const DataLayout &DL = Mod->getDataLayout(); in getDivNumBits()
1199 return -1; in getDivNumBits()
1203 return -1; in getDivNumBits()
1206 unsigned DivBits = Num->getType()->getScalarSizeInBits() - SignBits; in getDivNumBits()
1213 // a 24-bit signed integer.
1218 unsigned SSBits = Num->getType()->getScalarSizeInBits(); in expandDivRem24()
1220 unsigned AtLeast = (SSBits <= 24) ? 0 : (SSBits - 24 + IsSigned); in expandDivRem24()
1222 if (DivBits == -1) in expandDivRem24()
1242 // jq = jq >> (bitsize - 2) in expandDivRem24Impl()
1270 FQ->copyFastMathFlags(Builder.getFastMathFlags()); in expandDivRem24Impl()
1272 // float fqneg = -fq; in expandDivRem24Impl()
1276 auto FMAD = !ST->hasMadMacF32Insts() in expandDivRem24Impl()
1280 {FQNeg->getType()}, {FQNeg, FB, FA}, FQ); in expandDivRem24Impl()
1311 int InRegBits = 32 - DivBits; in expandDivRem24Impl()
1317 = Builder.getInt32((UINT64_C(1) << DivBits) - 1); in expandDivRem24Impl()
1335 if (C->getType()->getScalarSizeInBits() <= 32) in divHasSpecialOptimization()
1350 // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2 in divHasSpecialOptimization()
1351 if (BinOpDen->getOpcode() == Instruction::Shl && in divHasSpecialOptimization()
1352 isa<Constant>(BinOpDen->getOperand(0)) && in divHasSpecialOptimization()
1353 isKnownToBeAPowerOfTwo(BinOpDen->getOperand(0), *DL, true, in divHasSpecialOptimization()
1366 return Constant::getAllOnesValue(V->getType()); in getSign32()
1368 return Constant::getNullValue(V->getType()); in getSign32()
1389 Type *Ty = X->getType(); in expandDivRem32()
1393 if (Ty->getScalarSizeInBits() != 32) { in expandDivRem32()
1432 // unsigned z = (unsigned)((4294967296.0 - 512.0) * v_rcp_f32((float)y)); in expandDivRem32()
1434 // // One round of UNR (Unsigned integer Newton-Raphson) to improve z. in expandDivRem32()
1435 // // Empirically this is guaranteed to give a "two-y" lower bound on in expandDivRem32()
1437 // z += umulh(z, -y * z); in expandDivRem32()
1441 // unsigned r = x - q * y; in expandDivRem32()
1446 // r -= y; in expandDivRem32()
1450 // r -= y; in expandDivRem32()
1509 if (NumDivBits == -1) in shrinkDivRem64()
1521 return IsSigned ? Builder.CreateSExt(Narrowed, Num->getType()) : in shrinkDivRem64()
1522 Builder.CreateZExt(Narrowed, Num->getType()); in shrinkDivRem64()
1548 if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) && in visitBinaryOperator()
1549 UA->isUniform(&I) && promoteUniformOpToI32(I)) in visitBinaryOperator()
1559 unsigned ScalarSize = Ty->getScalarSizeInBits(); in visitBinaryOperator()
1575 for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) { in visitBinaryOperator()
1585 // See if this 64-bit division can be shrunk to 32/24-bits before in visitBinaryOperator()
1589 // The general 64-bit expansion introduces control flow and doesn't in visitBinaryOperator()
1598 NewEltI->copyIRFlags(&I); in visitBinaryOperator()
1643 WidenLoad->copyMetadata(I); in visitLoadInst()
1647 if (auto *Range = WidenLoad->getMetadata(LLVMContext::MD_range)) { in visitLoadInst()
1649 mdconst::extract<ConstantInt>(Range->getOperand(0)); in visitLoadInst()
1651 if (Lower->isNullValue()) { in visitLoadInst()
1652 WidenLoad->setMetadata(LLVMContext::MD_range, nullptr); in visitLoadInst()
1655 ConstantAsMetadata::get(ConstantInt::get(I32Ty, Lower->getValue().zext(32))), in visitLoadInst()
1660 WidenLoad->setMetadata(LLVMContext::MD_range, in visitLoadInst()
1661 MDNode::get(Mod->getContext(), LowAndHigh)); in visitLoadInst()
1665 int TySize = Mod->getDataLayout().getTypeSizeInBits(I.getType()); in visitLoadInst()
1680 if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) && in visitICmpInst()
1681 UA->isUniform(&I)) in visitICmpInst()
1694 if (ST->has16BitInsts() && needsPromotionToI32(I.getType())) { in visitSelectInst()
1695 if (UA->isUniform(&I)) in visitSelectInst()
1709 Builder.setFastMathFlags(FPOp->getFastMathFlags()); in visitSelectInst()
1726 Fract->takeName(&I); in visitSelectInst()
1735 return IA && IB && IA->getParent() == IB->getParent(); in areInSameBB()
1741 const auto *FVT = dyn_cast<FixedVectorType>(V->getType()); in isInterestingPHIIncomingValue()
1748 BitVector EltsCovered(FVT->getNumElements()); in isInterestingPHIIncomingValue()
1750 const auto *Idx = dyn_cast<ConstantInt>(IE->getOperand(2)); in isInterestingPHIIncomingValue()
1752 // Non constant index/out of bounds index -> folding is unlikely. in isInterestingPHIIncomingValue()
1755 if (!Idx || Idx->getZExtValue() >= FVT->getNumElements()) in isInterestingPHIIncomingValue()
1758 const auto *VecSrc = IE->getOperand(0); in isInterestingPHIIncomingValue()
1767 EltsCovered.set(Idx->getZExtValue()); in isInterestingPHIIncomingValue()
1787 return isa<Constant>(SV->getOperand(1)) || in isInterestingPHIIncomingValue()
1788 areInSameBB(SV, SV->getOperand(0)) || in isInterestingPHIIncomingValue()
1789 areInSameBB(SV, SV->getOperand(1)); in isInterestingPHIIncomingValue()
1816 return It->second; in canBreakPHINode()
1856 if (any_of(Cur->incoming_values(), isInterestingPHIIncomingValue)) { in canBreakPHINode()
1873 /// - The type of the slice (Ty)
1874 /// - The index in the incoming value's vector where the slice starts (Idx)
1875 /// - The number of elements in the slice (NumElts).
1879 /// <4 x i64> -> Split into four i64 slices.
1880 /// -> [i64, 0, 1], [i64, 1, 1], [i64, 2, 1], [i64, 3, 1]
1881 /// <5 x i16> -> Split into 2 <2 x i16> slices + a i16 tail.
1882 /// -> [<2 x i16>, 0, 2], [<2 x i16>, 2, 2], [i16, 4, 1]
1899 /// - It could cause issues with dominance (e.g. if bb.1 is seen first, then
1902 /// - We also want to make our extract instructions as local as possible so
1918 IRBuilder<> B(BB->getTerminator()); in getSlicedVal()
1920 B.SetCurrentDebugLocation(IncInst->getDebugLoc()); in getSlicedVal()
1938 // Break-up fixed-vector PHIs into smaller pieces. in visitPHINode()
1940 // its elements, or into 32-bit pieces (for 8/16 bit elts). in visitPHINode()
1944 // With large, odd-sized PHIs we may end up needing many `build_vector` in visitPHINode()
1952 if (!FVT || FVT->getNumElements() == 1 || in visitPHINode()
1953 DL->getTypeSizeInBits(FVT) <= BreakLargePHIsThreshold) in visitPHINode()
1961 Type *EltTy = FVT->getElementType(); in visitPHINode()
1965 // 32-bit slices as we can, and scalarize the tail. in visitPHINode()
1966 const unsigned EltSize = DL->getTypeSizeInBits(EltTy); in visitPHINode()
1967 const unsigned NumElts = FVT->getNumElements(); in visitPHINode()
1993 B.SetInsertPoint(I.getParent()->getFirstNonPHIIt()); in visitPHINode()
1997 S.NewPHI->addIncoming(S.getSlicedVal(BB, I.getIncomingValue(Idx), in visitPHINode()
2035 if (const auto *Arg = dyn_cast<Argument>(V); Arg && Arg->hasNonNullAttr()) in isPtrKnownNeverNull()
2040 if (AS != cast<PointerType>(V->getType())->getAddressSpace()) in isPtrKnownNeverNull()
2050 // address spaces have non-zero null values. in isPtrKnownNeverNull()
2055 assert((NullVal == 0 || NullVal == -1) && in isPtrKnownNeverNull()
2064 if (I.getType()->isVectorTy()) in visitAddrSpaceCastInst()
2113 if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) && in visitBitreverseIntrinsicInst()
2114 UA->isUniform(&I)) in visitBitreverseIntrinsicInst()
2120 /// Match non-nan fract pattern.
2121 /// minnum(fsub(x, floor(x)), nextafter(1.0, -1.0)
2126 if (ST->hasFractBug()) in matchFractPat()
2133 if (!isLegalFloatingTy(Ty->getScalarType())) in matchFractPat()
2145 One.convert(C->getSemantics(), APFloat::rmNearestTiesToEven, &LosesInfo); in matchFractPat()
2147 // Match nextafter(1.0, -1) in matchFractPat()
2166 Type *Ty = FractArg->getType()->getScalarType(); in applyFractPat()
2172 return insertValues(Builder, FractArg->getType(), ResultVals); in applyFractPat()
2192 Fract->takeName(&I); in visitMinNum()
2201 return match(Val, m_APFloat(C)) && C->getExactLog2Abs() == 0; in isOneOrNegOne()
2204 // Expand llvm.sqrt.f32 calls with !fpmath metadata in a semi-fast way.
2206 Type *Ty = Sqrt.getType()->getScalarType(); in visitSqrt()
2207 if (!Ty->isFloatTy() && (!Ty->isHalfTy() || ST->has16BitInsts())) in visitSqrt()
2211 FastMathFlags SqrtFMF = FPOp->getFastMathFlags(); in visitSqrt()
2213 // We're trying to handle the fast-but-not-that-fast case only. The lowering in visitSqrt()
2218 const float ReqdAccuracy = FPOp->getFPAccuracy(); in visitSqrt()
2229 if (FDiv && FDiv->getOpcode() == Instruction::FDiv && in visitSqrt()
2230 FDiv->getFPAccuracy() >= 1.0f && in visitSqrt()
2231 canOptimizeWithRsq(FPOp, FDiv->getFastMathFlags(), SqrtFMF) && in visitSqrt()
2232 // TODO: We should also handle the arcp case for the fdiv with non-1 value in visitSqrt()
2233 isOneOrNegOne(FDiv->getOperand(0))) in visitSqrt()
2239 // The raw instruction is 1 ulp, but the correction for denormal handling in visitSqrt()
2257 NewSqrt->takeName(&Sqrt); in visitSqrt()
2265 Impl.DL = &Impl.Mod->getDataLayout(); in doInitialization()
2279 const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>(); in runOnFunction()
2286 Impl.DT = DTWP ? &DTWP->getDomTree() : nullptr; in runOnFunction()
2298 Impl.DL = &Impl.Mod->getDataLayout(); in run()