//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "SIModeRegisterDefaults.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Utils/IntegerDivision.h"
#include "llvm/Transforms/Utils/Local.h"

#define DEBUG_TYPE "amdgpu-codegenprepare"

using namespace llvm;
using namespace llvm::PatternMatch;

namespace {

static cl::opt<bool> WidenLoads(
  "amdgpu-codegenprepare-widen-constant-loads",
  cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
  cl::ReallyHidden,
  cl::init(false));

static cl::opt<bool> Widen16BitOps(
  "amdgpu-codegenprepare-widen-16-bit-ops",
  cl::desc("Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"),
  cl::ReallyHidden,
  cl::init(true));

static cl::opt<bool>
    ScalarizeLargePHIs("amdgpu-codegenprepare-break-large-phis",
                       cl::desc("Break large PHI nodes for DAGISel"),
                       cl::ReallyHidden, cl::init(true));

static cl::opt<bool>
    ForceScalarizeLargePHIs("amdgpu-codegenprepare-force-break-large-phis",
                            cl::desc("For testing purposes, always break large "
                                     "PHIs even if it isn't profitable."),
                            cl::ReallyHidden, cl::init(false));

static cl::opt<unsigned> ScalarizeLargePHIsThreshold(
    "amdgpu-codegenprepare-break-large-phis-threshold",
    cl::desc("Minimum type size in bits for breaking large PHI nodes"),
    cl::ReallyHidden, cl::init(32));

static cl::opt<bool> UseMul24Intrin(
  "amdgpu-codegenprepare-mul24",
  cl::desc("Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
  cl::ReallyHidden,
  cl::init(true));

// Legalize 64-bit division by using the generic IR expansion.
static cl::opt<bool> ExpandDiv64InIR(
  "amdgpu-codegenprepare-expand-div64",
  cl::desc("Expand 64-bit division in AMDGPUCodeGenPrepare"),
  cl::ReallyHidden,
  cl::init(false));

// Leave all division operations as they are. This supersedes ExpandDiv64InIR
// and is used for testing the legalizer.
static cl::opt<bool> DisableIDivExpand(
  "amdgpu-codegenprepare-disable-idiv-expansion",
  cl::desc("Prevent expanding integer division in AMDGPUCodeGenPrepare"),
  cl::ReallyHidden,
  cl::init(false));

// Disable processing of fdiv so we can better test the backend implementations.
static cl::opt<bool> DisableFDivExpand(
  "amdgpu-codegenprepare-disable-fdiv-expansion",
  cl::desc("Prevent expanding floating point division in AMDGPUCodeGenPrepare"),
  cl::ReallyHidden,
  cl::init(false));

class AMDGPUCodeGenPrepareImpl
    : public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
public:
  const GCNSubtarget *ST = nullptr;
  const TargetLibraryInfo *TLInfo = nullptr;
  AssumptionCache *AC = nullptr;
  DominatorTree *DT = nullptr;
  UniformityInfo *UA = nullptr;
  Module *Mod = nullptr;
  const DataLayout *DL = nullptr;
  bool HasUnsafeFPMath = false;
  bool HasFP32DenormalFlush = false;
  bool FlowChanged = false;

  DenseMap<const PHINode *, bool> BreakPhiNodesCache;

  bool canBreakPHINode(const PHINode &I);

  /// \returns \p T's base element bit width.
  unsigned getBaseElementBitWidth(const Type *T) const;

  /// \returns Equivalent 32 bit integer type for given type \p T. For example,
  /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
  /// is returned.
  Type *getI32Ty(IRBuilder<> &B, const Type *T) const;

  /// \returns True if binary operation \p I is a signed binary operation, false
  /// otherwise.
  bool isSigned(const BinaryOperator &I) const;

  /// \returns True if the condition of 'select' operation \p I comes from a
  /// signed 'icmp' operation, false otherwise.
  bool isSigned(const SelectInst &I) const;

  /// \returns True if type \p T needs to be promoted to 32 bit integer type,
  /// false otherwise.
  bool needsPromotionToI32(const Type *T) const;

  /// Return true if \p T is a legal scalar floating point type.
  bool isLegalFloatingTy(const Type *T) const;

  /// Wrapper to pass all the arguments to computeKnownFPClass.
  KnownFPClass computeKnownFPClass(const Value *V, FPClassTest Interested,
                                   const Instruction *CtxI) const {
    return llvm::computeKnownFPClass(V, *DL, Interested, 0, TLInfo, AC, CtxI,
                                     DT);
  }

  bool canIgnoreDenormalInput(const Value *V, const Instruction *CtxI) const {
    return HasFP32DenormalFlush ||
           computeKnownFPClass(V, fcSubnormal, CtxI).isKnownNeverSubnormal();
  }

  /// Promotes uniform binary operation \p I to the equivalent 32 bit binary
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, replacing \p I with the equivalent 32 bit binary operation,
  /// and truncating the result of the 32 bit binary operation back to \p I's
  /// original type. Division operations are not promoted.
  ///
  /// \returns True if \p I is promoted to an equivalent 32 bit binary
  /// operation, false otherwise.
  bool promoteUniformOpToI32(BinaryOperator &I) const;

  /// Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, and replacing \p I with a 32 bit 'icmp' operation.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(ICmpInst &I) const;

  /// Promotes uniform 'select' operation \p I to 32 bit 'select'
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, replacing \p I with a 32 bit 'select' operation, and
  /// truncating the result of the 32 bit 'select' operation back to \p I's
  /// original type.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(SelectInst &I) const;

  /// Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse'
  /// intrinsic.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by zero extending the operand to 32
  /// bits, replacing \p I with a 32 bit 'bitreverse' intrinsic, shifting the
  /// result of the 32 bit 'bitreverse' intrinsic to the right with zero fill
  /// (the shift amount is 32 minus \p I's base element bit width), and
  /// truncating the result of the shift operation back to \p I's original type.
  ///
  /// \returns True.
  bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;

  /// \returns The minimum number of bits needed to store the value of \p Op as
  /// an unsigned integer. Truncating to this size and then zero-extending to
  /// the original size will not change the value.
  unsigned numBitsUnsigned(Value *Op) const;

  /// \returns The minimum number of bits needed to store the value of \p Op as
  /// a signed integer. Truncating to this size and then sign-extending to
  /// the original size will not change the value.
  unsigned numBitsSigned(Value *Op) const;

  /// Replace mul instructions with llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.s24.
  /// SelectionDAG has an issue where an and asserting the bits are known
  bool replaceMulWithMul24(BinaryOperator &I) const;

  /// Perform the same function as the equivalently named function in
  /// DAGCombiner. Since we expand some divisions here, we need to perform this
  /// before obscuring.
  bool foldBinOpIntoSelect(BinaryOperator &I) const;

  bool divHasSpecialOptimization(BinaryOperator &I,
                                 Value *Num, Value *Den) const;
  int getDivNumBits(BinaryOperator &I,
                    Value *Num, Value *Den,
                    unsigned AtLeast, bool Signed) const;

  /// Expands 24 bit div or rem.
  Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I,
                        Value *Num, Value *Den,
                        bool IsDiv, bool IsSigned) const;

  Value *expandDivRem24Impl(IRBuilder<> &Builder, BinaryOperator &I,
                            Value *Num, Value *Den, unsigned NumBits,
                            bool IsDiv, bool IsSigned) const;

  /// Expands 32 bit div or rem.
  Value* expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I,
                        Value *Num, Value *Den) const;

  Value *shrinkDivRem64(IRBuilder<> &Builder, BinaryOperator &I,
                        Value *Num, Value *Den) const;
  void expandDivRem64(BinaryOperator &I) const;

  /// Widen a scalar load.
  ///
  /// \details Widen a scalar load for uniform, small type loads from constant
  /// memory to a full 32 bits and then truncate the input to allow a scalar
  /// load instead of a vector load.
  ///
  /// \returns True.

  bool canWidenScalarExtLoad(LoadInst &I) const;

  Value *matchFractPat(IntrinsicInst &I);
  Value *applyFractPat(IRBuilder<> &Builder, Value *FractArg);

  bool canOptimizeWithRsq(const FPMathOperator *SqrtOp, FastMathFlags DivFMF,
                          FastMathFlags SqrtFMF) const;

  Value *optimizeWithRsq(IRBuilder<> &Builder, Value *Num, Value *Den,
                         FastMathFlags DivFMF, FastMathFlags SqrtFMF,
                         const Instruction *CtxI) const;

  Value *optimizeWithRcp(IRBuilder<> &Builder, Value *Num, Value *Den,
                         FastMathFlags FMF, const Instruction *CtxI) const;
  Value *optimizeWithFDivFast(IRBuilder<> &Builder, Value *Num, Value *Den,
                              float ReqdAccuracy) const;

  Value *visitFDivElement(IRBuilder<> &Builder, Value *Num, Value *Den,
                          FastMathFlags DivFMF, FastMathFlags SqrtFMF,
                          Value *RsqOp, const Instruction *FDiv,
                          float ReqdAccuracy) const;

  std::pair<Value *, Value *> getFrexpResults(IRBuilder<> &Builder,
                                              Value *Src) const;

  Value *emitRcpIEEE1ULP(IRBuilder<> &Builder, Value *Src,
                         bool IsNegative) const;
  Value *emitFrexpDiv(IRBuilder<> &Builder, Value *LHS, Value *RHS,
                      FastMathFlags FMF) const;

public:
  bool visitFDiv(BinaryOperator &I);

  bool visitInstruction(Instruction &I) { return false; }
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitLoadInst(LoadInst &I);
  bool visitICmpInst(ICmpInst &I);
  bool visitSelectInst(SelectInst &I);
  bool visitPHINode(PHINode &I);

  bool visitIntrinsicInst(IntrinsicInst &I);
  bool visitBitreverseIntrinsicInst(IntrinsicInst &I);
  bool visitMinNum(IntrinsicInst &I);
  bool run(Function &F);
};

class AMDGPUCodeGenPrepare : public FunctionPass {
private:
  AMDGPUCodeGenPrepareImpl Impl;

public:
  static char ID;
  AMDGPUCodeGenPrepare() : FunctionPass(ID) {
    initializeAMDGPUCodeGenPreparePass(*PassRegistry::getPassRegistry());
  }
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<UniformityInfoWrapperPass>();
    AU.addRequired<TargetLibraryInfoWrapperPass>();

    // FIXME: Division expansion needs to preserve the dominator tree.
    if (!ExpandDiv64InIR)
      AU.setPreservesAll();
  }
  bool runOnFunction(Function &F) override;
  bool doInitialization(Module &M) override;
  StringRef getPassName() const override { return "AMDGPU IR optimizations"; }
};

} // end anonymous namespace

bool AMDGPUCodeGenPrepareImpl::run(Function &F) {
  bool MadeChange = false;

  Function::iterator NextBB;
  for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; FI = NextBB) {
    BasicBlock *BB = &*FI;
    NextBB = std::next(FI);

    BasicBlock::iterator Next;
    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;
         I = Next) {
      Next = std::next(I);

      MadeChange |= visit(*I);

      if (Next != E) { // Control flow changed
        BasicBlock *NextInstBB = Next->getParent();
        if (NextInstBB != BB) {
          BB = NextInstBB;
          E = BB->end();
          FE = F.end();
        }
      }
    }
  }
  return MadeChange;
}

unsigned AMDGPUCodeGenPrepareImpl::getBaseElementBitWidth(const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return T->getIntegerBitWidth();
  return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
}

Type *AMDGPUCodeGenPrepareImpl::getI32Ty(IRBuilder<> &B, const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return B.getInt32Ty();
  return FixedVectorType::get(B.getInt32Ty(), cast<FixedVectorType>(T));
}

bool AMDGPUCodeGenPrepareImpl::isSigned(const BinaryOperator &I) const {
  return I.getOpcode() == Instruction::AShr ||
         I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;
}

bool AMDGPUCodeGenPrepareImpl::isSigned(const SelectInst &I) const {
  return isa<ICmpInst>(I.getOperand(0)) ?
      cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
}

bool AMDGPUCodeGenPrepareImpl::needsPromotionToI32(const Type *T) const {
  if (!Widen16BitOps)
    return false;

  const IntegerType *IntTy = dyn_cast<IntegerType>(T);
  if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
    return true;

  if (const VectorType *VT = dyn_cast<VectorType>(T)) {
    // TODO: The set of packed operations is more limited, so may want to
    // promote some anyway.
    if (ST->hasVOP3PInsts())
      return false;

    return needsPromotionToI32(VT->getElementType());
  }

  return false;
}

bool AMDGPUCodeGenPrepareImpl::isLegalFloatingTy(const Type *Ty) const {
  return Ty->isFloatTy() || Ty->isDoubleTy() ||
         (Ty->isHalfTy() && ST->has16BitInsts());
}

// Return true if the op promoted to i32 should have nsw set.
static bool promotedOpIsNSW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Sub:
    return true;
  case Instruction::Mul:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}

// Return true if the op promoted to i32 should have nuw set.
static bool promotedOpIsNUW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Mul:
    return true;
  case Instruction::Sub:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}

bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &I) const {
  Type *Ty = I.getType();
  const DataLayout &DL = Mod->getDataLayout();
  int TySize = DL.getTypeSizeInBits(Ty);
  Align Alignment = DL.getValueOrABITypeAlignment(I.getAlign(), Ty);

  return I.isSimple() && TySize < 32 && Alignment >= 4 && UA->isUniform(&I);
}

bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(BinaryOperator &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  if (I.getOpcode() == Instruction::SDiv ||
      I.getOpcode() == Instruction::UDiv ||
      I.getOpcode() == Instruction::SRem ||
      I.getOpcode() == Instruction::URem)
    return false;

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }

  ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
  if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
    if (promotedOpIsNSW(cast<Instruction>(I)))
      Inst->setHasNoSignedWrap();

    if (promotedOpIsNUW(cast<Instruction>(I)))
      Inst->setHasNoUnsignedWrap();

    if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
      Inst->setIsExact(ExactOp->isExact());
  }

  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(ICmpInst &I) const {
  assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *NewICmp = nullptr;

  if (I.isSigned()) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }
  NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);

  I.replaceAllUsesWith(NewICmp);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(SelectInst &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp1 = nullptr;
  Value *ExtOp2 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
  } else {
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
  }
  ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepareImpl::promoteUniformBitreverseToI32(
    IntrinsicInst &I) const {
  assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
         "I must be bitreverse intrinsic");
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Function *I32 =
      Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty });
  Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
  Value *ExtRes = Builder.CreateCall(I32, { ExtOp });
  Value *LShrOp =
      Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
  Value *TruncRes =
      Builder.CreateTrunc(LShrOp, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

unsigned AMDGPUCodeGenPrepareImpl::numBitsUnsigned(Value *Op) const {
  return computeKnownBits(Op, *DL, 0, AC).countMaxActiveBits();
}

unsigned AMDGPUCodeGenPrepareImpl::numBitsSigned(Value *Op) const {
  return ComputeMaxSignificantBits(Op, *DL, 0, AC);
}

static void extractValues(IRBuilder<> &Builder,
                          SmallVectorImpl<Value *> &Values, Value *V) {
  auto *VT = dyn_cast<FixedVectorType>(V->getType());
  if (!VT) {
    Values.push_back(V);
    return;
  }

  for (int I = 0, E = VT->getNumElements(); I != E; ++I)
    Values.push_back(Builder.CreateExtractElement(V, I));
}

static Value *insertValues(IRBuilder<> &Builder,
                           Type *Ty,
                           SmallVectorImpl<Value *> &Values) {
  if (!Ty->isVectorTy()) {
    assert(Values.size() == 1);
    return Values[0];
  }

  Value *NewVal = PoisonValue::get(Ty);
  for (int I = 0, E = Values.size(); I != E; ++I)
    NewVal = Builder.CreateInsertElement(NewVal, Values[I], I);

  return NewVal;
}

// Returns 24-bit or 48-bit (as per `NumBits` and `Size`) mul of `LHS` and
// `RHS`. `NumBits` is the number of KnownBits of the result and `Size` is the
// width of the original destination.
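// For results wider than 32 bits, the 48-bit product below is reassembled
// from the low and high 32-bit halves produced by the mul24 and mulhi24
// intrinsics.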
static Value *getMul24(IRBuilder<> &Builder, Value *LHS, Value *RHS,
                       unsigned Size, unsigned NumBits, bool IsSigned) {
  if (Size <= 32 || NumBits <= 32) {
    Intrinsic::ID ID =
        IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
    return Builder.CreateIntrinsic(ID, {}, {LHS, RHS});
  }

  assert(NumBits <= 48);

  Intrinsic::ID LoID =
      IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
  Intrinsic::ID HiID =
      IsSigned ? Intrinsic::amdgcn_mulhi_i24 : Intrinsic::amdgcn_mulhi_u24;

  Value *Lo = Builder.CreateIntrinsic(LoID, {}, {LHS, RHS});
  Value *Hi = Builder.CreateIntrinsic(HiID, {}, {LHS, RHS});

  IntegerType *I64Ty = Builder.getInt64Ty();
  Lo = Builder.CreateZExtOrTrunc(Lo, I64Ty);
  Hi = Builder.CreateZExtOrTrunc(Hi, I64Ty);

  return Builder.CreateOr(Lo, Builder.CreateShl(Hi, 32));
}

bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const {
  if (I.getOpcode() != Instruction::Mul)
    return false;

  Type *Ty = I.getType();
  unsigned Size = Ty->getScalarSizeInBits();
  if (Size <= 16 && ST->has16BitInsts())
    return false;

  // Prefer scalar if this could be s_mul_i32
  if (UA->isUniform(&I))
    return false;

  Value *LHS = I.getOperand(0);
  Value *RHS = I.getOperand(1);
  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  unsigned LHSBits = 0, RHSBits = 0;
  bool IsSigned = false;

  if (ST->hasMulU24() && (LHSBits = numBitsUnsigned(LHS)) <= 24 &&
      (RHSBits = numBitsUnsigned(RHS)) <= 24) {
    IsSigned = false;

  } else if (ST->hasMulI24() && (LHSBits = numBitsSigned(LHS)) <= 24 &&
             (RHSBits = numBitsSigned(RHS)) <= 24) {
    IsSigned = true;

  } else
    return false;

  SmallVector<Value *, 4> LHSVals;
  SmallVector<Value *, 4> RHSVals;
  SmallVector<Value *, 4> ResultVals;
  extractValues(Builder, LHSVals, LHS);
  extractValues(Builder, RHSVals, RHS);

  IntegerType *I32Ty = Builder.getInt32Ty();
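  // Scalarize: extractValues produced one entry per vector element (or a
  // single entry for a scalar), so emit one 24-bit multiply per element and
  // let insertValues rebuild the original result type afterwards.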
  for (int I = 0, E = LHSVals.size(); I != E; ++I) {
    Value *LHS, *RHS;
    if (IsSigned) {
      LHS = Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty);
      RHS = Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty);
    } else {
      LHS = Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
      RHS = Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);
    }

    Value *Result =
        getMul24(Builder, LHS, RHS, Size, LHSBits + RHSBits, IsSigned);

    if (IsSigned) {
      ResultVals.push_back(
          Builder.CreateSExtOrTrunc(Result, LHSVals[I]->getType()));
    } else {
      ResultVals.push_back(
          Builder.CreateZExtOrTrunc(Result, LHSVals[I]->getType()));
    }
  }

  Value *NewVal = insertValues(Builder, Ty, ResultVals);
  NewVal->takeName(&I);
  I.replaceAllUsesWith(NewVal);
  I.eraseFromParent();

  return true;
}

// Find a select instruction, which may have been casted. This is mostly to deal
// with cases where i16 selects were promoted here to i32.
static SelectInst *findSelectThroughCast(Value *V, CastInst *&Cast) {
  Cast = nullptr;
  if (SelectInst *Sel = dyn_cast<SelectInst>(V))
    return Sel;

  if ((Cast = dyn_cast<CastInst>(V))) {
    if (SelectInst *Sel = dyn_cast<SelectInst>(Cast->getOperand(0)))
      return Sel;
  }

  return nullptr;
}

bool AMDGPUCodeGenPrepareImpl::foldBinOpIntoSelect(BinaryOperator &BO) const {
  // Don't do this unless the old select is going away. We want to eliminate the
  // binary operator, not replace a binop with a select.
  int SelOpNo = 0;

  CastInst *CastOp;

  // TODO: Should probably try to handle some cases with multiple
  // users. Duplicating the select may be profitable for division.
  SelectInst *Sel = findSelectThroughCast(BO.getOperand(0), CastOp);
  if (!Sel || !Sel->hasOneUse()) {
    SelOpNo = 1;
    Sel = findSelectThroughCast(BO.getOperand(1), CastOp);
  }

  if (!Sel || !Sel->hasOneUse())
    return false;

  Constant *CT = dyn_cast<Constant>(Sel->getTrueValue());
  Constant *CF = dyn_cast<Constant>(Sel->getFalseValue());
  Constant *CBO = dyn_cast<Constant>(BO.getOperand(SelOpNo ^ 1));
  if (!CBO || !CT || !CF)
    return false;

  if (CastOp) {
    if (!CastOp->hasOneUse())
      return false;
    CT = ConstantFoldCastOperand(CastOp->getOpcode(), CT, BO.getType(), *DL);
    CF = ConstantFoldCastOperand(CastOp->getOpcode(), CF, BO.getType(), *DL);
  }

  // TODO: Handle special 0/-1 cases DAG combine does, although we only really
  // need to handle divisions here.
  Constant *FoldedT = SelOpNo ?
    ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CT, *DL) :
    ConstantFoldBinaryOpOperands(BO.getOpcode(), CT, CBO, *DL);
  if (!FoldedT || isa<ConstantExpr>(FoldedT))
    return false;

  Constant *FoldedF = SelOpNo ?
    ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CF, *DL) :
    ConstantFoldBinaryOpOperands(BO.getOpcode(), CF, CBO, *DL);
  if (!FoldedF || isa<ConstantExpr>(FoldedF))
    return false;

  IRBuilder<> Builder(&BO);
  Builder.SetCurrentDebugLocation(BO.getDebugLoc());
  if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(&BO))
    Builder.setFastMathFlags(FPOp->getFastMathFlags());

  Value *NewSelect = Builder.CreateSelect(Sel->getCondition(),
                                          FoldedT, FoldedF);
  NewSelect->takeName(&BO);
  BO.replaceAllUsesWith(NewSelect);
  BO.eraseFromParent();
  if (CastOp)
    CastOp->eraseFromParent();
  Sel->eraseFromParent();
  return true;
}

std::pair<Value *, Value *>
AMDGPUCodeGenPrepareImpl::getFrexpResults(IRBuilder<> &Builder,
                                          Value *Src) const {
  Type *Ty = Src->getType();
  Value *Frexp = Builder.CreateIntrinsic(Intrinsic::frexp,
                                         {Ty, Builder.getInt32Ty()}, Src);
  Value *FrexpMant = Builder.CreateExtractValue(Frexp, {0});

  // Bypass the bug workaround for the exponent result since it doesn't matter.
  // TODO: Does the bug workaround even really need to consider the exponent
  // result? It's unspecified by the spec.

  Value *FrexpExp =
      ST->hasFractBug()
          ? Builder.CreateIntrinsic(Intrinsic::amdgcn_frexp_exp,
                                    {Builder.getInt32Ty(), Ty}, Src)
          : Builder.CreateExtractValue(Frexp, {1});
  return {FrexpMant, FrexpExp};
}

/// Emit an expansion of 1.0 / Src good for 1ulp that supports denormals.
Value *AMDGPUCodeGenPrepareImpl::emitRcpIEEE1ULP(IRBuilder<> &Builder,
                                                 Value *Src,
                                                 bool IsNegative) const {
  // Same as for 1.0, but expand the sign out of the constant.
  // -1.0 / x -> rcp (fneg x)
  if (IsNegative)
    Src = Builder.CreateFNeg(Src);

  // The rcp instruction doesn't support denormals, so scale the input
  // out of the denormal range and convert at the end.
  //
  // Expand as 2^-n * (1.0 / (x * 2^n))

  // TODO: Skip scaling if input is known never denormal and the input
  // range won't underflow to denormal. The hard part is knowing the
  // result. We need a range check, the result could be denormal for
  // 0x1p+126 < den <= 0x1p+127.

  Type *Ty = Src->getType();

  auto [FrexpMant, FrexpExp] = getFrexpResults(Builder, Src);
  Value *ScaleFactor = Builder.CreateNeg(FrexpExp);
  Value *Rcp = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, FrexpMant);
  return Builder.CreateIntrinsic(Intrinsic::ldexp, {Ty, Builder.getInt32Ty()},
                                 {Rcp, ScaleFactor});
}

/// Emit a 2ulp expansion for fdiv by using frexp for input scaling.
Value *AMDGPUCodeGenPrepareImpl::emitFrexpDiv(IRBuilder<> &Builder, Value *LHS,
                                              Value *RHS,
                                              FastMathFlags FMF) const {
  // If we have to work around the fract/frexp bug, we're worse off than using
  // the fdiv.fast expansion. The full safe expansion is faster if we have fast
  // FMA.
  if (HasFP32DenormalFlush && ST->hasFractBug() && !ST->hasFastFMAF32() &&
      (!FMF.noNaNs() || !FMF.noInfs()))
    return nullptr;

  // We're scaling the LHS to avoid a denormal input, and scale the denominator
  // to avoid large values underflowing the result.
  Type *Ty = LHS->getType();

  auto [FrexpMantRHS, FrexpExpRHS] = getFrexpResults(Builder, RHS);

  Value *Rcp =
      Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, FrexpMantRHS);

  auto [FrexpMantLHS, FrexpExpLHS] = getFrexpResults(Builder, LHS);
  Value *Mul = Builder.CreateFMul(FrexpMantLHS, Rcp);

  // We multiplied by 2^N/2^M, so we need to multiply by 2^(N-M) to scale the
  // result.
  Value *ExpDiff = Builder.CreateSub(FrexpExpLHS, FrexpExpRHS);
  return Builder.CreateIntrinsic(Intrinsic::ldexp, {Ty, Builder.getInt32Ty()},
                                 {Mul, ExpDiff});
}

/// Emit an expansion of 1.0 / sqrt(Src) good for 1ulp that supports denormals.
static Value *emitRsqIEEE1ULP(IRBuilder<> &Builder, Value *Src,
                              bool IsNegative) {
  // bool need_scale = x < 0x1p-126f;
  // float input_scale = need_scale ? 0x1.0p+24f : 1.0f;
  // float output_scale = need_scale ? 0x1.0p+12f : 1.0f;
  // rsq(x * input_scale) * output_scale;

  Type *Ty = Src->getType();
  APFloat SmallestNormal =
      APFloat::getSmallestNormalized(Ty->getFltSemantics());
  Value *NeedScale =
      Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));
  Constant *One = ConstantFP::get(Ty, 1.0);
  Constant *InputScale = ConstantFP::get(Ty, 0x1.0p+24);
  Constant *OutputScale =
      ConstantFP::get(Ty, IsNegative ? -0x1.0p+12 : 0x1.0p+12);

  Value *InputScaleFactor = Builder.CreateSelect(NeedScale, InputScale, One);

  Value *ScaledInput = Builder.CreateFMul(Src, InputScaleFactor);
  Value *Rsq = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, ScaledInput);
  Value *OutputScaleFactor = Builder.CreateSelect(
      NeedScale, OutputScale, IsNegative ? ConstantFP::get(Ty, -1.0) : One);

  return Builder.CreateFMul(Rsq, OutputScaleFactor);
}

bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(const FPMathOperator *SqrtOp,
                                                  FastMathFlags DivFMF,
                                                  FastMathFlags SqrtFMF) const {
  // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp.
  if (!DivFMF.allowContract() || !SqrtFMF.allowContract())
    return false;

  // v_rsq_f32 gives 1ulp
  return SqrtFMF.approxFunc() || HasUnsafeFPMath ||
         SqrtOp->getFPAccuracy() >= 1.0f;
}

Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
    IRBuilder<> &Builder, Value *Num, Value *Den, FastMathFlags DivFMF,
    FastMathFlags SqrtFMF, const Instruction *CtxI) const {
  // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp.
  assert(DivFMF.allowContract() && SqrtFMF.allowContract());

  // rsq_f16 is accurate to 0.51 ulp.
  // rsq_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
  // rsq_f64 is never accurate.
  const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num);
  if (!CLHS)
    return nullptr;

  assert(Den->getType()->isFloatTy());

  bool IsNegative = false;

  // TODO: Handle other numerator values with arcp.
  if (CLHS->isExactlyValue(1.0) || (IsNegative = CLHS->isExactlyValue(-1.0))) {
    // Add in the sqrt flags.
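    // FastMathFlagGuard is an RAII helper: the builder's previous fast-math
    // flags are restored automatically when this scope ends.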
    IRBuilder<>::FastMathFlagGuard Guard(Builder);
    DivFMF |= SqrtFMF;
    Builder.setFastMathFlags(DivFMF);

    if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) ||
        canIgnoreDenormalInput(Den, CtxI)) {
      Value *Result = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, Den);
      // -1.0 / sqrt(x) -> fneg(rsq(x))
      return IsNegative ? Builder.CreateFNeg(Result) : Result;
    }

    return emitRsqIEEE1ULP(Builder, Den, IsNegative);
  }

  return nullptr;
}

// Optimize fdiv with rcp:
//
// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
//        allowed with unsafe-fp-math or afn.
//
// a/b -> a*rcp(b) when arcp is allowed, and we only need to provide ULP 1.0
Value *
AMDGPUCodeGenPrepareImpl::optimizeWithRcp(IRBuilder<> &Builder, Value *Num,
                                          Value *Den, FastMathFlags FMF,
                                          const Instruction *CtxI) const {
  // rcp_f16 is accurate to 0.51 ulp.
  // rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
  // rcp_f64 is never accurate.
  assert(Den->getType()->isFloatTy());

  if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num)) {
    bool IsNegative = false;
    if (CLHS->isExactlyValue(1.0) ||
        (IsNegative = CLHS->isExactlyValue(-1.0))) {
      Value *Src = Den;

      if (HasFP32DenormalFlush || FMF.approxFunc()) {
        // -1.0 / x -> 1.0 / fneg(x)
        if (IsNegative)
          Src = Builder.CreateFNeg(Src);

        // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
        // the CI documentation have a worst case error of 1 ulp.
        // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK
        // to use it as long as we aren't trying to use denormals.
        //
        // v_rcp_f16 and v_rsq_f16 DO support denormals.

        // NOTE: v_sqrt and v_rcp will be combined to v_rsq later. So we don't
        //       insert rsq intrinsic here.

        // 1.0 / x -> rcp(x)
        return Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, Src);
      }

      // TODO: If the input isn't denormal, and we know the input exponent isn't
      // big enough to introduce a denormal we can avoid the scaling.
971*06c3fb27SDimitry Andric return emitRcpIEEE1ULP(Builder, Src, IsNegative); 9725ffd83dbSDimitry Andric } 9735ffd83dbSDimitry Andric } 9745ffd83dbSDimitry Andric 975*06c3fb27SDimitry Andric if (FMF.allowReciprocal()) { 9765ffd83dbSDimitry Andric // x / y -> x * (1.0 / y) 977*06c3fb27SDimitry Andric 978*06c3fb27SDimitry Andric // TODO: Could avoid denormal scaling and use raw rcp if we knew the output 979*06c3fb27SDimitry Andric // will never underflow. 980*06c3fb27SDimitry Andric if (HasFP32DenormalFlush || FMF.approxFunc()) { 981*06c3fb27SDimitry Andric Value *Recip = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, Den); 9825ffd83dbSDimitry Andric return Builder.CreateFMul(Num, Recip); 9835ffd83dbSDimitry Andric } 984*06c3fb27SDimitry Andric 985*06c3fb27SDimitry Andric Value *Recip = emitRcpIEEE1ULP(Builder, Den, false); 986*06c3fb27SDimitry Andric return Builder.CreateFMul(Num, Recip); 987*06c3fb27SDimitry Andric } 988*06c3fb27SDimitry Andric 9895ffd83dbSDimitry Andric return nullptr; 9905ffd83dbSDimitry Andric } 9915ffd83dbSDimitry Andric 9925ffd83dbSDimitry Andric // optimize with fdiv.fast: 9935ffd83dbSDimitry Andric // 9945ffd83dbSDimitry Andric // a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed. 9955ffd83dbSDimitry Andric // 9965ffd83dbSDimitry Andric // 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp. 9975ffd83dbSDimitry Andric // 9985ffd83dbSDimitry Andric // NOTE: optimizeWithRcp should be tried first because rcp is the preference. 999*06c3fb27SDimitry Andric Value *AMDGPUCodeGenPrepareImpl::optimizeWithFDivFast( 1000*06c3fb27SDimitry Andric IRBuilder<> &Builder, Value *Num, Value *Den, float ReqdAccuracy) const { 10015ffd83dbSDimitry Andric // fdiv.fast can achieve 2.5 ULP accuracy. 10025ffd83dbSDimitry Andric if (ReqdAccuracy < 2.5f) 10035ffd83dbSDimitry Andric return nullptr; 10045ffd83dbSDimitry Andric 10055ffd83dbSDimitry Andric // Only have fdiv.fast for f32. 1006*06c3fb27SDimitry Andric assert(Den->getType()->isFloatTy()); 10075ffd83dbSDimitry Andric 10085ffd83dbSDimitry Andric bool NumIsOne = false; 10095ffd83dbSDimitry Andric if (const ConstantFP *CNum = dyn_cast<ConstantFP>(Num)) { 10105ffd83dbSDimitry Andric if (CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0)) 10115ffd83dbSDimitry Andric NumIsOne = true; 10125ffd83dbSDimitry Andric } 10135ffd83dbSDimitry Andric 10145ffd83dbSDimitry Andric // fdiv does not support denormals. But 1.0/x is always fine to use it. 1015*06c3fb27SDimitry Andric // 1016*06c3fb27SDimitry Andric // TODO: This works for any value with a specific known exponent range, don't 1017*06c3fb27SDimitry Andric // just limit to constant 1. 
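// Illustrative behavior of this helper (hypothetical IR, values invented for
// the example): with f32 denormals not flushed, a general
//   %d = fdiv float %a, %b, !fpmath !0   ; !0 = !{float 2.5}
// is refused here and the caller falls back to another expansion, while
//   %d = fdiv float 1.0, %x, !fpmath !0
// (NumIsOne) may still be turned into a call to @llvm.amdgcn.fdiv.fast.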
1018*06c3fb27SDimitry Andric if (!HasFP32DenormalFlush && !NumIsOne) 10195ffd83dbSDimitry Andric return nullptr; 10205ffd83dbSDimitry Andric 1021*06c3fb27SDimitry Andric return Builder.CreateIntrinsic(Intrinsic::amdgcn_fdiv_fast, {}, {Num, Den}); 1022*06c3fb27SDimitry Andric } 1023*06c3fb27SDimitry Andric 1024*06c3fb27SDimitry Andric Value *AMDGPUCodeGenPrepareImpl::visitFDivElement( 1025*06c3fb27SDimitry Andric IRBuilder<> &Builder, Value *Num, Value *Den, FastMathFlags DivFMF, 1026*06c3fb27SDimitry Andric FastMathFlags SqrtFMF, Value *RsqOp, const Instruction *FDivInst, 1027*06c3fb27SDimitry Andric float ReqdDivAccuracy) const { 1028*06c3fb27SDimitry Andric if (RsqOp) { 1029*06c3fb27SDimitry Andric Value *Rsq = 1030*06c3fb27SDimitry Andric optimizeWithRsq(Builder, Num, RsqOp, DivFMF, SqrtFMF, FDivInst); 1031*06c3fb27SDimitry Andric if (Rsq) 1032*06c3fb27SDimitry Andric return Rsq; 1033*06c3fb27SDimitry Andric } 1034*06c3fb27SDimitry Andric 1035*06c3fb27SDimitry Andric Value *Rcp = optimizeWithRcp(Builder, Num, Den, DivFMF, FDivInst); 1036*06c3fb27SDimitry Andric if (Rcp) 1037*06c3fb27SDimitry Andric return Rcp; 1038*06c3fb27SDimitry Andric 1039*06c3fb27SDimitry Andric // In the basic case fdiv_fast has the same instruction count as the frexp div 1040*06c3fb27SDimitry Andric // expansion. Slightly prefer fdiv_fast since it ends in an fmul that can 1041*06c3fb27SDimitry Andric // potentially be fused into a user. Also, materialization of the constants 1042*06c3fb27SDimitry Andric // can be reused for multiple instances. 1043*06c3fb27SDimitry Andric Value *FDivFast = optimizeWithFDivFast(Builder, Num, Den, ReqdDivAccuracy); 1044*06c3fb27SDimitry Andric if (FDivFast) 1045*06c3fb27SDimitry Andric return FDivFast; 1046*06c3fb27SDimitry Andric 1047*06c3fb27SDimitry Andric return emitFrexpDiv(Builder, Num, Den, DivFMF); 10485ffd83dbSDimitry Andric } 10495ffd83dbSDimitry Andric 10505ffd83dbSDimitry Andric // Optimizations is performed based on fpmath, fast math flags as well as 10515ffd83dbSDimitry Andric // denormals to optimize fdiv with either rcp or fdiv.fast. 10525ffd83dbSDimitry Andric // 10535ffd83dbSDimitry Andric // With rcp: 10545ffd83dbSDimitry Andric // 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is 10555ffd83dbSDimitry Andric // allowed with unsafe-fp-math or afn. 10565ffd83dbSDimitry Andric // 10575ffd83dbSDimitry Andric // a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn. 10585ffd83dbSDimitry Andric // 10595ffd83dbSDimitry Andric // With fdiv.fast: 10605ffd83dbSDimitry Andric // a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed. 10615ffd83dbSDimitry Andric // 10625ffd83dbSDimitry Andric // 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp. 10635ffd83dbSDimitry Andric // 10645ffd83dbSDimitry Andric // NOTE: rcp is the preference in cases that both are legal. 1065*06c3fb27SDimitry Andric bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) { 1066*06c3fb27SDimitry Andric if (DisableFDivExpand) 1067*06c3fb27SDimitry Andric return false; 10685ffd83dbSDimitry Andric 10695ffd83dbSDimitry Andric Type *Ty = FDiv.getType()->getScalarType(); 1070*06c3fb27SDimitry Andric if (!Ty->isFloatTy()) 1071*06c3fb27SDimitry Andric return false; 10725ffd83dbSDimitry Andric 1073e8d8bef9SDimitry Andric // The f64 rcp/rsq approximations are pretty inaccurate. We can do an 1074*06c3fb27SDimitry Andric // expansion around them in codegen. f16 is good enough to always use. 
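// A minimal before/after sketch for the simplest case handled below
// (illustrative IR, assuming f32 denormals are flushed and no fast-math
// flags are present):
//   %r = fdiv float 1.0, %x, !fpmath !0   ; !0 = !{float 2.5}
// becomes
//   %r = call float @llvm.amdgcn.rcp.f32(float %x)
// via optimizeWithRcp; other cases fall back to fdiv.fast or the frexp-based
// expansion, element by element for vector types.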
10750b57cec5SDimitry Andric 10760b57cec5SDimitry Andric const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv); 1077*06c3fb27SDimitry Andric const FastMathFlags DivFMF = FPOp->getFastMathFlags(); 10785ffd83dbSDimitry Andric const float ReqdAccuracy = FPOp->getFPAccuracy(); 10790b57cec5SDimitry Andric 10805ffd83dbSDimitry Andric // Inaccurate rcp is allowed with unsafe-fp-math or afn. 1081*06c3fb27SDimitry Andric // 1082*06c3fb27SDimitry Andric // Defer to codegen to handle this. 1083*06c3fb27SDimitry Andric // 1084*06c3fb27SDimitry Andric // TODO: Decide on an interpretation for interactions between afn + arcp + 1085*06c3fb27SDimitry Andric // !fpmath, and make it consistent between here and codegen. For now, defer 1086*06c3fb27SDimitry Andric // expansion of afn to codegen. The current interpretation is so aggressive we 1087*06c3fb27SDimitry Andric // don't need any pre-consideration here when we have better information. A 1088*06c3fb27SDimitry Andric // more conservative interpretation could use handling here. 1089*06c3fb27SDimitry Andric const bool AllowInaccurateRcp = HasUnsafeFPMath || DivFMF.approxFunc(); 1090*06c3fb27SDimitry Andric if (AllowInaccurateRcp) 1091*06c3fb27SDimitry Andric return false; 10920b57cec5SDimitry Andric 1093*06c3fb27SDimitry Andric // Defer the correct implementations to codegen. 1094*06c3fb27SDimitry Andric if (ReqdAccuracy < 1.0f) 1095*06c3fb27SDimitry Andric return false; 10960b57cec5SDimitry Andric 1097*06c3fb27SDimitry Andric FastMathFlags SqrtFMF; 10980b57cec5SDimitry Andric 10990b57cec5SDimitry Andric Value *Num = FDiv.getOperand(0); 11000b57cec5SDimitry Andric Value *Den = FDiv.getOperand(1); 11010b57cec5SDimitry Andric 1102*06c3fb27SDimitry Andric Value *RsqOp = nullptr; 1103*06c3fb27SDimitry Andric auto *DenII = dyn_cast<IntrinsicInst>(Den); 1104*06c3fb27SDimitry Andric if (DenII && DenII->getIntrinsicID() == Intrinsic::sqrt && 1105*06c3fb27SDimitry Andric DenII->hasOneUse()) { 1106*06c3fb27SDimitry Andric const auto *SqrtOp = cast<FPMathOperator>(DenII); 1107*06c3fb27SDimitry Andric SqrtFMF = SqrtOp->getFastMathFlags(); 1108*06c3fb27SDimitry Andric if (canOptimizeWithRsq(SqrtOp, DivFMF, SqrtFMF)) 1109*06c3fb27SDimitry Andric RsqOp = SqrtOp->getOperand(0); 11100b57cec5SDimitry Andric } 11110b57cec5SDimitry Andric 1112*06c3fb27SDimitry Andric IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator())); 1113*06c3fb27SDimitry Andric Builder.setFastMathFlags(DivFMF); 1114*06c3fb27SDimitry Andric Builder.SetCurrentDebugLocation(FDiv.getDebugLoc()); 1115*06c3fb27SDimitry Andric 1116*06c3fb27SDimitry Andric SmallVector<Value *, 4> NumVals; 1117*06c3fb27SDimitry Andric SmallVector<Value *, 4> DenVals; 1118*06c3fb27SDimitry Andric SmallVector<Value *, 4> RsqDenVals; 1119*06c3fb27SDimitry Andric extractValues(Builder, NumVals, Num); 1120*06c3fb27SDimitry Andric extractValues(Builder, DenVals, Den); 1121*06c3fb27SDimitry Andric 1122*06c3fb27SDimitry Andric if (RsqOp) 1123*06c3fb27SDimitry Andric extractValues(Builder, RsqDenVals, RsqOp); 1124*06c3fb27SDimitry Andric 1125*06c3fb27SDimitry Andric SmallVector<Value *, 4> ResultVals(NumVals.size()); 1126*06c3fb27SDimitry Andric for (int I = 0, E = NumVals.size(); I != E; ++I) { 1127*06c3fb27SDimitry Andric Value *NumElt = NumVals[I]; 1128*06c3fb27SDimitry Andric Value *DenElt = DenVals[I]; 1129*06c3fb27SDimitry Andric Value *RsqDenElt = RsqOp ? 
RsqDenVals[I] : nullptr;
1130*06c3fb27SDimitry Andric
1131*06c3fb27SDimitry Andric Value *NewElt =
1132*06c3fb27SDimitry Andric visitFDivElement(Builder, NumElt, DenElt, DivFMF, SqrtFMF, RsqDenElt,
1133*06c3fb27SDimitry Andric cast<Instruction>(FPOp), ReqdAccuracy);
1134*06c3fb27SDimitry Andric if (!NewElt) {
1135*06c3fb27SDimitry Andric // Keep the original, but scalarized.
1136*06c3fb27SDimitry Andric
1137*06c3fb27SDimitry Andric // This has the unfortunate side effect of sometimes scalarizing when
1138*06c3fb27SDimitry Andric // we're not going to do anything.
1139*06c3fb27SDimitry Andric NewElt = Builder.CreateFDiv(NumElt, DenElt);
1140*06c3fb27SDimitry Andric if (auto *NewEltInst = dyn_cast<Instruction>(NewElt))
1141*06c3fb27SDimitry Andric NewEltInst->copyMetadata(FDiv);
11420b57cec5SDimitry Andric }
11430b57cec5SDimitry Andric
1144*06c3fb27SDimitry Andric ResultVals[I] = NewElt;
11450b57cec5SDimitry Andric }
11460b57cec5SDimitry Andric
1147*06c3fb27SDimitry Andric Value *NewVal = insertValues(Builder, FDiv.getType(), ResultVals);
1148fe6060f1SDimitry Andric
1149*06c3fb27SDimitry Andric if (NewVal) {
1150*06c3fb27SDimitry Andric FDiv.replaceAllUsesWith(NewVal);
1151*06c3fb27SDimitry Andric NewVal->takeName(&FDiv);
1152*06c3fb27SDimitry Andric RecursivelyDeleteTriviallyDeadInstructions(&FDiv, TLInfo);
1153*06c3fb27SDimitry Andric }
1154fe6060f1SDimitry Andric
1155fe6060f1SDimitry Andric return true;
1156fe6060f1SDimitry Andric }
1157fe6060f1SDimitry Andric
11580b57cec5SDimitry Andric static bool hasUnsafeFPMath(const Function &F) {
11590b57cec5SDimitry Andric Attribute Attr = F.getFnAttribute("unsafe-fp-math");
1160fe6060f1SDimitry Andric return Attr.getValueAsBool();
11610b57cec5SDimitry Andric }
11620b57cec5SDimitry Andric
11630b57cec5SDimitry Andric static std::pair<Value*, Value*> getMul64(IRBuilder<> &Builder,
11640b57cec5SDimitry Andric Value *LHS, Value *RHS) {
11650b57cec5SDimitry Andric Type *I32Ty = Builder.getInt32Ty();
11660b57cec5SDimitry Andric Type *I64Ty = Builder.getInt64Ty();
11670b57cec5SDimitry Andric
11680b57cec5SDimitry Andric Value *LHS_EXT64 = Builder.CreateZExt(LHS, I64Ty);
11690b57cec5SDimitry Andric Value *RHS_EXT64 = Builder.CreateZExt(RHS, I64Ty);
11700b57cec5SDimitry Andric Value *MUL64 = Builder.CreateMul(LHS_EXT64, RHS_EXT64);
11710b57cec5SDimitry Andric Value *Lo = Builder.CreateTrunc(MUL64, I32Ty);
11720b57cec5SDimitry Andric Value *Hi = Builder.CreateLShr(MUL64, Builder.getInt64(32));
11730b57cec5SDimitry Andric Hi = Builder.CreateTrunc(Hi, I32Ty);
1174bdd1243dSDimitry Andric return std::pair(Lo, Hi);
11750b57cec5SDimitry Andric }
11760b57cec5SDimitry Andric
11770b57cec5SDimitry Andric static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) {
11780b57cec5SDimitry Andric return getMul64(Builder, LHS, RHS).second;
11790b57cec5SDimitry Andric }
11800b57cec5SDimitry Andric
118181ad6265SDimitry Andric /// Figure out how many bits are really needed for this division. \p AtLeast is
11825ffd83dbSDimitry Andric /// an optimization hint to bypass the second ComputeNumSignBits call if the
11835ffd83dbSDimitry Andric /// first one is insufficient. Returns -1 on failure.
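/// For example (illustrative): signed-dividing two i64 values that each have
/// at least 40 known sign bits needs only 64 - 40 + 1 = 25 significant bits,
/// so shrinkDivRem64 can use the 32-bit expansion instead of the full 64-bit
/// one.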
1184*06c3fb27SDimitry Andric int AMDGPUCodeGenPrepareImpl::getDivNumBits(BinaryOperator &I, Value *Num, 1185*06c3fb27SDimitry Andric Value *Den, unsigned AtLeast, 1186*06c3fb27SDimitry Andric bool IsSigned) const { 11875ffd83dbSDimitry Andric const DataLayout &DL = Mod->getDataLayout(); 11885ffd83dbSDimitry Andric unsigned LHSSignBits = ComputeNumSignBits(Num, DL, 0, AC, &I); 11895ffd83dbSDimitry Andric if (LHSSignBits < AtLeast) 11905ffd83dbSDimitry Andric return -1; 11915ffd83dbSDimitry Andric 11925ffd83dbSDimitry Andric unsigned RHSSignBits = ComputeNumSignBits(Den, DL, 0, AC, &I); 11935ffd83dbSDimitry Andric if (RHSSignBits < AtLeast) 11945ffd83dbSDimitry Andric return -1; 11955ffd83dbSDimitry Andric 11965ffd83dbSDimitry Andric unsigned SignBits = std::min(LHSSignBits, RHSSignBits); 11975ffd83dbSDimitry Andric unsigned DivBits = Num->getType()->getScalarSizeInBits() - SignBits; 11985ffd83dbSDimitry Andric if (IsSigned) 11995ffd83dbSDimitry Andric ++DivBits; 12005ffd83dbSDimitry Andric return DivBits; 12015ffd83dbSDimitry Andric } 12025ffd83dbSDimitry Andric 12030b57cec5SDimitry Andric // The fractional part of a float is enough to accurately represent up to 12040b57cec5SDimitry Andric // a 24-bit signed integer. 1205*06c3fb27SDimitry Andric Value *AMDGPUCodeGenPrepareImpl::expandDivRem24(IRBuilder<> &Builder, 1206*06c3fb27SDimitry Andric BinaryOperator &I, Value *Num, 1207*06c3fb27SDimitry Andric Value *Den, bool IsDiv, 1208*06c3fb27SDimitry Andric bool IsSigned) const { 12095ffd83dbSDimitry Andric int DivBits = getDivNumBits(I, Num, Den, 9, IsSigned); 12105ffd83dbSDimitry Andric if (DivBits == -1) 12110b57cec5SDimitry Andric return nullptr; 12125ffd83dbSDimitry Andric return expandDivRem24Impl(Builder, I, Num, Den, DivBits, IsDiv, IsSigned); 12135ffd83dbSDimitry Andric } 12140b57cec5SDimitry Andric 1215*06c3fb27SDimitry Andric Value *AMDGPUCodeGenPrepareImpl::expandDivRem24Impl( 1216*06c3fb27SDimitry Andric IRBuilder<> &Builder, BinaryOperator &I, Value *Num, Value *Den, 1217*06c3fb27SDimitry Andric unsigned DivBits, bool IsDiv, bool IsSigned) const { 12180b57cec5SDimitry Andric Type *I32Ty = Builder.getInt32Ty(); 12195ffd83dbSDimitry Andric Num = Builder.CreateTrunc(Num, I32Ty); 12205ffd83dbSDimitry Andric Den = Builder.CreateTrunc(Den, I32Ty); 12215ffd83dbSDimitry Andric 12220b57cec5SDimitry Andric Type *F32Ty = Builder.getFloatTy(); 12230b57cec5SDimitry Andric ConstantInt *One = Builder.getInt32(1); 12240b57cec5SDimitry Andric Value *JQ = One; 12250b57cec5SDimitry Andric 12260b57cec5SDimitry Andric if (IsSigned) { 12270b57cec5SDimitry Andric // char|short jq = ia ^ ib; 12280b57cec5SDimitry Andric JQ = Builder.CreateXor(Num, Den); 12290b57cec5SDimitry Andric 12300b57cec5SDimitry Andric // jq = jq >> (bitsize - 2) 12310b57cec5SDimitry Andric JQ = Builder.CreateAShr(JQ, Builder.getInt32(30)); 12320b57cec5SDimitry Andric 12330b57cec5SDimitry Andric // jq = jq | 0x1 12340b57cec5SDimitry Andric JQ = Builder.CreateOr(JQ, One); 12350b57cec5SDimitry Andric } 12360b57cec5SDimitry Andric 12370b57cec5SDimitry Andric // int ia = (int)LHS; 12380b57cec5SDimitry Andric Value *IA = Num; 12390b57cec5SDimitry Andric 12400b57cec5SDimitry Andric // int ib, (int)RHS; 12410b57cec5SDimitry Andric Value *IB = Den; 12420b57cec5SDimitry Andric 12430b57cec5SDimitry Andric // float fa = (float)ia; 12440b57cec5SDimitry Andric Value *FA = IsSigned ? 
Builder.CreateSIToFP(IA, F32Ty) 12450b57cec5SDimitry Andric : Builder.CreateUIToFP(IA, F32Ty); 12460b57cec5SDimitry Andric 12470b57cec5SDimitry Andric // float fb = (float)ib; 12480b57cec5SDimitry Andric Value *FB = IsSigned ? Builder.CreateSIToFP(IB,F32Ty) 12490b57cec5SDimitry Andric : Builder.CreateUIToFP(IB,F32Ty); 12500b57cec5SDimitry Andric 12515ffd83dbSDimitry Andric Function *RcpDecl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp, 12525ffd83dbSDimitry Andric Builder.getFloatTy()); 12535ffd83dbSDimitry Andric Value *RCP = Builder.CreateCall(RcpDecl, { FB }); 12540b57cec5SDimitry Andric Value *FQM = Builder.CreateFMul(FA, RCP); 12550b57cec5SDimitry Andric 12560b57cec5SDimitry Andric // fq = trunc(fqm); 12570b57cec5SDimitry Andric CallInst *FQ = Builder.CreateUnaryIntrinsic(Intrinsic::trunc, FQM); 12580b57cec5SDimitry Andric FQ->copyFastMathFlags(Builder.getFastMathFlags()); 12590b57cec5SDimitry Andric 12600b57cec5SDimitry Andric // float fqneg = -fq; 12610b57cec5SDimitry Andric Value *FQNeg = Builder.CreateFNeg(FQ); 12620b57cec5SDimitry Andric 12630b57cec5SDimitry Andric // float fr = mad(fqneg, fb, fa); 12645ffd83dbSDimitry Andric auto FMAD = !ST->hasMadMacF32Insts() 12655ffd83dbSDimitry Andric ? Intrinsic::fma 12665ffd83dbSDimitry Andric : (Intrinsic::ID)Intrinsic::amdgcn_fmad_ftz; 12675ffd83dbSDimitry Andric Value *FR = Builder.CreateIntrinsic(FMAD, 12680b57cec5SDimitry Andric {FQNeg->getType()}, {FQNeg, FB, FA}, FQ); 12690b57cec5SDimitry Andric 12700b57cec5SDimitry Andric // int iq = (int)fq; 12710b57cec5SDimitry Andric Value *IQ = IsSigned ? Builder.CreateFPToSI(FQ, I32Ty) 12720b57cec5SDimitry Andric : Builder.CreateFPToUI(FQ, I32Ty); 12730b57cec5SDimitry Andric 12740b57cec5SDimitry Andric // fr = fabs(fr); 12750b57cec5SDimitry Andric FR = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FR, FQ); 12760b57cec5SDimitry Andric 12770b57cec5SDimitry Andric // fb = fabs(fb); 12780b57cec5SDimitry Andric FB = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FB, FQ); 12790b57cec5SDimitry Andric 12800b57cec5SDimitry Andric // int cv = fr >= fb; 12810b57cec5SDimitry Andric Value *CV = Builder.CreateFCmpOGE(FR, FB); 12820b57cec5SDimitry Andric 12830b57cec5SDimitry Andric // jq = (cv ? jq : 0); 12840b57cec5SDimitry Andric JQ = Builder.CreateSelect(CV, JQ, Builder.getInt32(0)); 12850b57cec5SDimitry Andric 12860b57cec5SDimitry Andric // dst = iq + jq; 12870b57cec5SDimitry Andric Value *Div = Builder.CreateAdd(IQ, JQ); 12880b57cec5SDimitry Andric 12890b57cec5SDimitry Andric Value *Res = Div; 12900b57cec5SDimitry Andric if (!IsDiv) { 12910b57cec5SDimitry Andric // Rem needs compensation, it's easier to recompute it 12920b57cec5SDimitry Andric Value *Rem = Builder.CreateMul(Div, Den); 12930b57cec5SDimitry Andric Res = Builder.CreateSub(Num, Rem); 12940b57cec5SDimitry Andric } 12950b57cec5SDimitry Andric 12965ffd83dbSDimitry Andric if (DivBits != 0 && DivBits < 32) { 12975ffd83dbSDimitry Andric // Extend in register from the number of bits this divide really is. 
12980b57cec5SDimitry Andric if (IsSigned) { 12995ffd83dbSDimitry Andric int InRegBits = 32 - DivBits; 13005ffd83dbSDimitry Andric 13015ffd83dbSDimitry Andric Res = Builder.CreateShl(Res, InRegBits); 13025ffd83dbSDimitry Andric Res = Builder.CreateAShr(Res, InRegBits); 13030b57cec5SDimitry Andric } else { 13045ffd83dbSDimitry Andric ConstantInt *TruncMask 13055ffd83dbSDimitry Andric = Builder.getInt32((UINT64_C(1) << DivBits) - 1); 13060b57cec5SDimitry Andric Res = Builder.CreateAnd(Res, TruncMask); 13070b57cec5SDimitry Andric } 13085ffd83dbSDimitry Andric } 13090b57cec5SDimitry Andric 13100b57cec5SDimitry Andric return Res; 13110b57cec5SDimitry Andric } 13120b57cec5SDimitry Andric 13135ffd83dbSDimitry Andric // Try to recognize special cases the DAG will emit special, better expansions 13145ffd83dbSDimitry Andric // than the general expansion we do here. 13155ffd83dbSDimitry Andric 13165ffd83dbSDimitry Andric // TODO: It would be better to just directly handle those optimizations here. 1317*06c3fb27SDimitry Andric bool AMDGPUCodeGenPrepareImpl::divHasSpecialOptimization(BinaryOperator &I, 1318*06c3fb27SDimitry Andric Value *Num, 1319*06c3fb27SDimitry Andric Value *Den) const { 13205ffd83dbSDimitry Andric if (Constant *C = dyn_cast<Constant>(Den)) { 13215ffd83dbSDimitry Andric // Arbitrary constants get a better expansion as long as a wider mulhi is 13225ffd83dbSDimitry Andric // legal. 13235ffd83dbSDimitry Andric if (C->getType()->getScalarSizeInBits() <= 32) 13245ffd83dbSDimitry Andric return true; 13255ffd83dbSDimitry Andric 13265ffd83dbSDimitry Andric // TODO: Sdiv check for not exact for some reason. 13275ffd83dbSDimitry Andric 13285ffd83dbSDimitry Andric // If there's no wider mulhi, there's only a better expansion for powers of 13295ffd83dbSDimitry Andric // two. 13305ffd83dbSDimitry Andric // TODO: Should really know for each vector element. 13315ffd83dbSDimitry Andric if (isKnownToBeAPowerOfTwo(C, *DL, true, 0, AC, &I, DT)) 13325ffd83dbSDimitry Andric return true; 13335ffd83dbSDimitry Andric 13345ffd83dbSDimitry Andric return false; 13355ffd83dbSDimitry Andric } 13365ffd83dbSDimitry Andric 13375ffd83dbSDimitry Andric if (BinaryOperator *BinOpDen = dyn_cast<BinaryOperator>(Den)) { 13385ffd83dbSDimitry Andric // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2 13395ffd83dbSDimitry Andric if (BinOpDen->getOpcode() == Instruction::Shl && 13405ffd83dbSDimitry Andric isa<Constant>(BinOpDen->getOperand(0)) && 13415ffd83dbSDimitry Andric isKnownToBeAPowerOfTwo(BinOpDen->getOperand(0), *DL, true, 13425ffd83dbSDimitry Andric 0, AC, &I, DT)) { 13435ffd83dbSDimitry Andric return true; 13445ffd83dbSDimitry Andric } 13455ffd83dbSDimitry Andric } 13465ffd83dbSDimitry Andric 13475ffd83dbSDimitry Andric return false; 13485ffd83dbSDimitry Andric } 13495ffd83dbSDimitry Andric 13505ffd83dbSDimitry Andric static Value *getSign32(Value *V, IRBuilder<> &Builder, const DataLayout *DL) { 13515ffd83dbSDimitry Andric // Check whether the sign can be determined statically. 
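// For instance (illustrative), if V is a zext from i16, the known bits prove
// it is non-negative and we return constant zero below instead of emitting an
// ashr.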
13525ffd83dbSDimitry Andric KnownBits Known = computeKnownBits(V, *DL); 13535ffd83dbSDimitry Andric if (Known.isNegative()) 13545ffd83dbSDimitry Andric return Constant::getAllOnesValue(V->getType()); 13555ffd83dbSDimitry Andric if (Known.isNonNegative()) 13565ffd83dbSDimitry Andric return Constant::getNullValue(V->getType()); 13575ffd83dbSDimitry Andric return Builder.CreateAShr(V, Builder.getInt32(31)); 13585ffd83dbSDimitry Andric } 13595ffd83dbSDimitry Andric 1360*06c3fb27SDimitry Andric Value *AMDGPUCodeGenPrepareImpl::expandDivRem32(IRBuilder<> &Builder, 13615ffd83dbSDimitry Andric BinaryOperator &I, Value *X, 13625ffd83dbSDimitry Andric Value *Y) const { 13630b57cec5SDimitry Andric Instruction::BinaryOps Opc = I.getOpcode(); 13640b57cec5SDimitry Andric assert(Opc == Instruction::URem || Opc == Instruction::UDiv || 13650b57cec5SDimitry Andric Opc == Instruction::SRem || Opc == Instruction::SDiv); 13660b57cec5SDimitry Andric 13670b57cec5SDimitry Andric FastMathFlags FMF; 13680b57cec5SDimitry Andric FMF.setFast(); 13690b57cec5SDimitry Andric Builder.setFastMathFlags(FMF); 13700b57cec5SDimitry Andric 13715ffd83dbSDimitry Andric if (divHasSpecialOptimization(I, X, Y)) 13725ffd83dbSDimitry Andric return nullptr; // Keep it for later optimization. 13730b57cec5SDimitry Andric 13740b57cec5SDimitry Andric bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv; 13750b57cec5SDimitry Andric bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv; 13760b57cec5SDimitry Andric 13775ffd83dbSDimitry Andric Type *Ty = X->getType(); 13780b57cec5SDimitry Andric Type *I32Ty = Builder.getInt32Ty(); 13790b57cec5SDimitry Andric Type *F32Ty = Builder.getFloatTy(); 13800b57cec5SDimitry Andric 13810b57cec5SDimitry Andric if (Ty->getScalarSizeInBits() < 32) { 13820b57cec5SDimitry Andric if (IsSigned) { 13835ffd83dbSDimitry Andric X = Builder.CreateSExt(X, I32Ty); 13845ffd83dbSDimitry Andric Y = Builder.CreateSExt(Y, I32Ty); 13850b57cec5SDimitry Andric } else { 13865ffd83dbSDimitry Andric X = Builder.CreateZExt(X, I32Ty); 13875ffd83dbSDimitry Andric Y = Builder.CreateZExt(Y, I32Ty); 13880b57cec5SDimitry Andric } 13890b57cec5SDimitry Andric } 13900b57cec5SDimitry Andric 13915ffd83dbSDimitry Andric if (Value *Res = expandDivRem24(Builder, I, X, Y, IsDiv, IsSigned)) { 13925ffd83dbSDimitry Andric return IsSigned ? Builder.CreateSExtOrTrunc(Res, Ty) : 13935ffd83dbSDimitry Andric Builder.CreateZExtOrTrunc(Res, Ty); 13940b57cec5SDimitry Andric } 13950b57cec5SDimitry Andric 13960b57cec5SDimitry Andric ConstantInt *Zero = Builder.getInt32(0); 13970b57cec5SDimitry Andric ConstantInt *One = Builder.getInt32(1); 13980b57cec5SDimitry Andric 13990b57cec5SDimitry Andric Value *Sign = nullptr; 14000b57cec5SDimitry Andric if (IsSigned) { 14015ffd83dbSDimitry Andric Value *SignX = getSign32(X, Builder, DL); 14025ffd83dbSDimitry Andric Value *SignY = getSign32(Y, Builder, DL); 14030b57cec5SDimitry Andric // Remainder sign is the same as LHS 14045ffd83dbSDimitry Andric Sign = IsDiv ? 
Builder.CreateXor(SignX, SignY) : SignX; 14050b57cec5SDimitry Andric 14065ffd83dbSDimitry Andric X = Builder.CreateAdd(X, SignX); 14075ffd83dbSDimitry Andric Y = Builder.CreateAdd(Y, SignY); 14080b57cec5SDimitry Andric 14095ffd83dbSDimitry Andric X = Builder.CreateXor(X, SignX); 14105ffd83dbSDimitry Andric Y = Builder.CreateXor(Y, SignY); 14110b57cec5SDimitry Andric } 14120b57cec5SDimitry Andric 14135ffd83dbSDimitry Andric // The algorithm here is based on ideas from "Software Integer Division", Tom 14145ffd83dbSDimitry Andric // Rodeheffer, August 2008. 14155ffd83dbSDimitry Andric // 14165ffd83dbSDimitry Andric // unsigned udiv(unsigned x, unsigned y) { 14175ffd83dbSDimitry Andric // // Initial estimate of inv(y). The constant is less than 2^32 to ensure 14185ffd83dbSDimitry Andric // // that this is a lower bound on inv(y), even if some of the calculations 14195ffd83dbSDimitry Andric // // round up. 14205ffd83dbSDimitry Andric // unsigned z = (unsigned)((4294967296.0 - 512.0) * v_rcp_f32((float)y)); 14215ffd83dbSDimitry Andric // 14225ffd83dbSDimitry Andric // // One round of UNR (Unsigned integer Newton-Raphson) to improve z. 14235ffd83dbSDimitry Andric // // Empirically this is guaranteed to give a "two-y" lower bound on 14245ffd83dbSDimitry Andric // // inv(y). 14255ffd83dbSDimitry Andric // z += umulh(z, -y * z); 14265ffd83dbSDimitry Andric // 14275ffd83dbSDimitry Andric // // Quotient/remainder estimate. 14285ffd83dbSDimitry Andric // unsigned q = umulh(x, z); 14295ffd83dbSDimitry Andric // unsigned r = x - q * y; 14305ffd83dbSDimitry Andric // 14315ffd83dbSDimitry Andric // // Two rounds of quotient/remainder refinement. 14325ffd83dbSDimitry Andric // if (r >= y) { 14335ffd83dbSDimitry Andric // ++q; 14345ffd83dbSDimitry Andric // r -= y; 14355ffd83dbSDimitry Andric // } 14365ffd83dbSDimitry Andric // if (r >= y) { 14375ffd83dbSDimitry Andric // ++q; 14385ffd83dbSDimitry Andric // r -= y; 14395ffd83dbSDimitry Andric // } 14405ffd83dbSDimitry Andric // 14415ffd83dbSDimitry Andric // return q; 14425ffd83dbSDimitry Andric // } 14430b57cec5SDimitry Andric 14445ffd83dbSDimitry Andric // Initial estimate of inv(y). 14455ffd83dbSDimitry Andric Value *FloatY = Builder.CreateUIToFP(Y, F32Ty); 14465ffd83dbSDimitry Andric Function *Rcp = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp, F32Ty); 14475ffd83dbSDimitry Andric Value *RcpY = Builder.CreateCall(Rcp, {FloatY}); 1448*06c3fb27SDimitry Andric Constant *Scale = ConstantFP::get(F32Ty, llvm::bit_cast<float>(0x4F7FFFFE)); 14495ffd83dbSDimitry Andric Value *ScaledY = Builder.CreateFMul(RcpY, Scale); 14505ffd83dbSDimitry Andric Value *Z = Builder.CreateFPToUI(ScaledY, I32Ty); 14510b57cec5SDimitry Andric 14525ffd83dbSDimitry Andric // One round of UNR. 14535ffd83dbSDimitry Andric Value *NegY = Builder.CreateSub(Zero, Y); 14545ffd83dbSDimitry Andric Value *NegYZ = Builder.CreateMul(NegY, Z); 14555ffd83dbSDimitry Andric Z = Builder.CreateAdd(Z, getMulHu(Builder, Z, NegYZ)); 14560b57cec5SDimitry Andric 14575ffd83dbSDimitry Andric // Quotient/remainder estimate. 14585ffd83dbSDimitry Andric Value *Q = getMulHu(Builder, X, Z); 14595ffd83dbSDimitry Andric Value *R = Builder.CreateSub(X, Builder.CreateMul(Q, Y)); 14600b57cec5SDimitry Andric 14615ffd83dbSDimitry Andric // First quotient/remainder refinement. 
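// Worked example (illustrative): for x = 10, y = 3, the estimate may come out
// as q = 2, r = 10 - 2 * 3 = 4. The first refinement sees r >= y, so q becomes
// 3 and r becomes 1; the second refinement is then a no-op. Per the "two-y"
// bound described above, the estimate is never more than two below the true
// quotient, which is why the algorithm uses exactly two refinement rounds.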
14625ffd83dbSDimitry Andric Value *Cond = Builder.CreateICmpUGE(R, Y); 14635ffd83dbSDimitry Andric if (IsDiv) 14645ffd83dbSDimitry Andric Q = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q); 14655ffd83dbSDimitry Andric R = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R); 14660b57cec5SDimitry Andric 14675ffd83dbSDimitry Andric // Second quotient/remainder refinement. 14685ffd83dbSDimitry Andric Cond = Builder.CreateICmpUGE(R, Y); 14690b57cec5SDimitry Andric Value *Res; 14705ffd83dbSDimitry Andric if (IsDiv) 14715ffd83dbSDimitry Andric Res = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q); 14725ffd83dbSDimitry Andric else 14735ffd83dbSDimitry Andric Res = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R); 14740b57cec5SDimitry Andric 14750b57cec5SDimitry Andric if (IsSigned) { 14760b57cec5SDimitry Andric Res = Builder.CreateXor(Res, Sign); 14770b57cec5SDimitry Andric Res = Builder.CreateSub(Res, Sign); 14780b57cec5SDimitry Andric } 14790b57cec5SDimitry Andric 14800b57cec5SDimitry Andric Res = Builder.CreateTrunc(Res, Ty); 14810b57cec5SDimitry Andric 14820b57cec5SDimitry Andric return Res; 14830b57cec5SDimitry Andric } 14840b57cec5SDimitry Andric 1485*06c3fb27SDimitry Andric Value *AMDGPUCodeGenPrepareImpl::shrinkDivRem64(IRBuilder<> &Builder, 1486*06c3fb27SDimitry Andric BinaryOperator &I, Value *Num, 1487*06c3fb27SDimitry Andric Value *Den) const { 14885ffd83dbSDimitry Andric if (!ExpandDiv64InIR && divHasSpecialOptimization(I, Num, Den)) 14895ffd83dbSDimitry Andric return nullptr; // Keep it for later optimization. 14905ffd83dbSDimitry Andric 14915ffd83dbSDimitry Andric Instruction::BinaryOps Opc = I.getOpcode(); 14925ffd83dbSDimitry Andric 14935ffd83dbSDimitry Andric bool IsDiv = Opc == Instruction::SDiv || Opc == Instruction::UDiv; 14945ffd83dbSDimitry Andric bool IsSigned = Opc == Instruction::SDiv || Opc == Instruction::SRem; 14955ffd83dbSDimitry Andric 14965ffd83dbSDimitry Andric int NumDivBits = getDivNumBits(I, Num, Den, 32, IsSigned); 14975ffd83dbSDimitry Andric if (NumDivBits == -1) 14985ffd83dbSDimitry Andric return nullptr; 14995ffd83dbSDimitry Andric 15005ffd83dbSDimitry Andric Value *Narrowed = nullptr; 15015ffd83dbSDimitry Andric if (NumDivBits <= 24) { 15025ffd83dbSDimitry Andric Narrowed = expandDivRem24Impl(Builder, I, Num, Den, NumDivBits, 15035ffd83dbSDimitry Andric IsDiv, IsSigned); 15045ffd83dbSDimitry Andric } else if (NumDivBits <= 32) { 15055ffd83dbSDimitry Andric Narrowed = expandDivRem32(Builder, I, Num, Den); 15065ffd83dbSDimitry Andric } 15075ffd83dbSDimitry Andric 15085ffd83dbSDimitry Andric if (Narrowed) { 15095ffd83dbSDimitry Andric return IsSigned ? Builder.CreateSExt(Narrowed, Num->getType()) : 15105ffd83dbSDimitry Andric Builder.CreateZExt(Narrowed, Num->getType()); 15115ffd83dbSDimitry Andric } 15125ffd83dbSDimitry Andric 15135ffd83dbSDimitry Andric return nullptr; 15145ffd83dbSDimitry Andric } 15155ffd83dbSDimitry Andric 1516*06c3fb27SDimitry Andric void AMDGPUCodeGenPrepareImpl::expandDivRem64(BinaryOperator &I) const { 15175ffd83dbSDimitry Andric Instruction::BinaryOps Opc = I.getOpcode(); 15185ffd83dbSDimitry Andric // Do the general expansion. 
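// Both expandDivisionUpTo64Bits and expandRemainderUpTo64Bits come from
// llvm/Transforms/Utils/IntegerDivision.h (included above) and rewrite the
// operation into an explicit long-division style expansion with new basic
// blocks, which is why callers record FlowChanged when this path is taken.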
15195ffd83dbSDimitry Andric if (Opc == Instruction::UDiv || Opc == Instruction::SDiv) { 15205ffd83dbSDimitry Andric expandDivisionUpTo64Bits(&I); 15215ffd83dbSDimitry Andric return; 15225ffd83dbSDimitry Andric } 15235ffd83dbSDimitry Andric 15245ffd83dbSDimitry Andric if (Opc == Instruction::URem || Opc == Instruction::SRem) { 15255ffd83dbSDimitry Andric expandRemainderUpTo64Bits(&I); 15265ffd83dbSDimitry Andric return; 15275ffd83dbSDimitry Andric } 15285ffd83dbSDimitry Andric 15295ffd83dbSDimitry Andric llvm_unreachable("not a division"); 15305ffd83dbSDimitry Andric } 15315ffd83dbSDimitry Andric 1532*06c3fb27SDimitry Andric bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) { 15335ffd83dbSDimitry Andric if (foldBinOpIntoSelect(I)) 15345ffd83dbSDimitry Andric return true; 15355ffd83dbSDimitry Andric 15360b57cec5SDimitry Andric if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) && 1537*06c3fb27SDimitry Andric UA->isUniform(&I) && promoteUniformOpToI32(I)) 15380b57cec5SDimitry Andric return true; 15390b57cec5SDimitry Andric 15408bcb0991SDimitry Andric if (UseMul24Intrin && replaceMulWithMul24(I)) 15410b57cec5SDimitry Andric return true; 15420b57cec5SDimitry Andric 15430b57cec5SDimitry Andric bool Changed = false; 15440b57cec5SDimitry Andric Instruction::BinaryOps Opc = I.getOpcode(); 15450b57cec5SDimitry Andric Type *Ty = I.getType(); 15460b57cec5SDimitry Andric Value *NewDiv = nullptr; 15475ffd83dbSDimitry Andric unsigned ScalarSize = Ty->getScalarSizeInBits(); 15485ffd83dbSDimitry Andric 15495ffd83dbSDimitry Andric SmallVector<BinaryOperator *, 8> Div64ToExpand; 15505ffd83dbSDimitry Andric 15510b57cec5SDimitry Andric if ((Opc == Instruction::URem || Opc == Instruction::UDiv || 15520b57cec5SDimitry Andric Opc == Instruction::SRem || Opc == Instruction::SDiv) && 15535ffd83dbSDimitry Andric ScalarSize <= 64 && 15545ffd83dbSDimitry Andric !DisableIDivExpand) { 15550b57cec5SDimitry Andric Value *Num = I.getOperand(0); 15560b57cec5SDimitry Andric Value *Den = I.getOperand(1); 15570b57cec5SDimitry Andric IRBuilder<> Builder(&I); 15580b57cec5SDimitry Andric Builder.SetCurrentDebugLocation(I.getDebugLoc()); 15590b57cec5SDimitry Andric 15605ffd83dbSDimitry Andric if (auto *VT = dyn_cast<FixedVectorType>(Ty)) { 1561bdd1243dSDimitry Andric NewDiv = PoisonValue::get(VT); 15620b57cec5SDimitry Andric 15630b57cec5SDimitry Andric for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) { 15640b57cec5SDimitry Andric Value *NumEltN = Builder.CreateExtractElement(Num, N); 15650b57cec5SDimitry Andric Value *DenEltN = Builder.CreateExtractElement(Den, N); 15665ffd83dbSDimitry Andric 15675ffd83dbSDimitry Andric Value *NewElt; 15685ffd83dbSDimitry Andric if (ScalarSize <= 32) { 15695ffd83dbSDimitry Andric NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN); 15700b57cec5SDimitry Andric if (!NewElt) 15710b57cec5SDimitry Andric NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN); 15725ffd83dbSDimitry Andric } else { 15735ffd83dbSDimitry Andric // See if this 64-bit division can be shrunk to 32/24-bits before 15745ffd83dbSDimitry Andric // producing the general expansion. 15755ffd83dbSDimitry Andric NewElt = shrinkDivRem64(Builder, I, NumEltN, DenEltN); 15765ffd83dbSDimitry Andric if (!NewElt) { 15775ffd83dbSDimitry Andric // The general 64-bit expansion introduces control flow and doesn't 15785ffd83dbSDimitry Andric // return the new value. Just insert a scalar copy and defer 15795ffd83dbSDimitry Andric // expanding it. 
15805ffd83dbSDimitry Andric NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN); 15815ffd83dbSDimitry Andric Div64ToExpand.push_back(cast<BinaryOperator>(NewElt)); 15825ffd83dbSDimitry Andric } 15835ffd83dbSDimitry Andric } 15845ffd83dbSDimitry Andric 15850b57cec5SDimitry Andric NewDiv = Builder.CreateInsertElement(NewDiv, NewElt, N); 15860b57cec5SDimitry Andric } 15870b57cec5SDimitry Andric } else { 15885ffd83dbSDimitry Andric if (ScalarSize <= 32) 15890b57cec5SDimitry Andric NewDiv = expandDivRem32(Builder, I, Num, Den); 15905ffd83dbSDimitry Andric else { 15915ffd83dbSDimitry Andric NewDiv = shrinkDivRem64(Builder, I, Num, Den); 15925ffd83dbSDimitry Andric if (!NewDiv) 15935ffd83dbSDimitry Andric Div64ToExpand.push_back(&I); 15945ffd83dbSDimitry Andric } 15950b57cec5SDimitry Andric } 15960b57cec5SDimitry Andric 15970b57cec5SDimitry Andric if (NewDiv) { 15980b57cec5SDimitry Andric I.replaceAllUsesWith(NewDiv); 15990b57cec5SDimitry Andric I.eraseFromParent(); 16000b57cec5SDimitry Andric Changed = true; 16010b57cec5SDimitry Andric } 16020b57cec5SDimitry Andric } 16030b57cec5SDimitry Andric 16045ffd83dbSDimitry Andric if (ExpandDiv64InIR) { 16055ffd83dbSDimitry Andric // TODO: We get much worse code in specially handled constant cases. 16065ffd83dbSDimitry Andric for (BinaryOperator *Div : Div64ToExpand) { 16075ffd83dbSDimitry Andric expandDivRem64(*Div); 1608*06c3fb27SDimitry Andric FlowChanged = true; 16095ffd83dbSDimitry Andric Changed = true; 16105ffd83dbSDimitry Andric } 16115ffd83dbSDimitry Andric } 16125ffd83dbSDimitry Andric 16130b57cec5SDimitry Andric return Changed; 16140b57cec5SDimitry Andric } 16150b57cec5SDimitry Andric 1616*06c3fb27SDimitry Andric bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) { 16170b57cec5SDimitry Andric if (!WidenLoads) 16180b57cec5SDimitry Andric return false; 16190b57cec5SDimitry Andric 16200b57cec5SDimitry Andric if ((I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS || 16210b57cec5SDimitry Andric I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) && 16220b57cec5SDimitry Andric canWidenScalarExtLoad(I)) { 16230b57cec5SDimitry Andric IRBuilder<> Builder(&I); 16240b57cec5SDimitry Andric Builder.SetCurrentDebugLocation(I.getDebugLoc()); 16250b57cec5SDimitry Andric 16260b57cec5SDimitry Andric Type *I32Ty = Builder.getInt32Ty(); 1627*06c3fb27SDimitry Andric LoadInst *WidenLoad = Builder.CreateLoad(I32Ty, I.getPointerOperand()); 16280b57cec5SDimitry Andric WidenLoad->copyMetadata(I); 16290b57cec5SDimitry Andric 16300b57cec5SDimitry Andric // If we have range metadata, we need to convert the type, and not make 16310b57cec5SDimitry Andric // assumptions about the high bits. 16320b57cec5SDimitry Andric if (auto *Range = WidenLoad->getMetadata(LLVMContext::MD_range)) { 16330b57cec5SDimitry Andric ConstantInt *Lower = 16340b57cec5SDimitry Andric mdconst::extract<ConstantInt>(Range->getOperand(0)); 16350b57cec5SDimitry Andric 1636349cc55cSDimitry Andric if (Lower->isNullValue()) { 16370b57cec5SDimitry Andric WidenLoad->setMetadata(LLVMContext::MD_range, nullptr); 16380b57cec5SDimitry Andric } else { 16390b57cec5SDimitry Andric Metadata *LowAndHigh[] = { 16400b57cec5SDimitry Andric ConstantAsMetadata::get(ConstantInt::get(I32Ty, Lower->getValue().zext(32))), 16410b57cec5SDimitry Andric // Don't make assumptions about the high bits. 
16420b57cec5SDimitry Andric ConstantAsMetadata::get(ConstantInt::get(I32Ty, 0)) 16430b57cec5SDimitry Andric }; 16440b57cec5SDimitry Andric 16450b57cec5SDimitry Andric WidenLoad->setMetadata(LLVMContext::MD_range, 16460b57cec5SDimitry Andric MDNode::get(Mod->getContext(), LowAndHigh)); 16470b57cec5SDimitry Andric } 16480b57cec5SDimitry Andric } 16490b57cec5SDimitry Andric 16500b57cec5SDimitry Andric int TySize = Mod->getDataLayout().getTypeSizeInBits(I.getType()); 16510b57cec5SDimitry Andric Type *IntNTy = Builder.getIntNTy(TySize); 16520b57cec5SDimitry Andric Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy); 16530b57cec5SDimitry Andric Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType()); 16540b57cec5SDimitry Andric I.replaceAllUsesWith(ValOrig); 16550b57cec5SDimitry Andric I.eraseFromParent(); 16560b57cec5SDimitry Andric return true; 16570b57cec5SDimitry Andric } 16580b57cec5SDimitry Andric 16590b57cec5SDimitry Andric return false; 16600b57cec5SDimitry Andric } 16610b57cec5SDimitry Andric 1662*06c3fb27SDimitry Andric bool AMDGPUCodeGenPrepareImpl::visitICmpInst(ICmpInst &I) { 16630b57cec5SDimitry Andric bool Changed = false; 16640b57cec5SDimitry Andric 16650b57cec5SDimitry Andric if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) && 1666*06c3fb27SDimitry Andric UA->isUniform(&I)) 16670b57cec5SDimitry Andric Changed |= promoteUniformOpToI32(I); 16680b57cec5SDimitry Andric 16690b57cec5SDimitry Andric return Changed; 16700b57cec5SDimitry Andric } 16710b57cec5SDimitry Andric 1672*06c3fb27SDimitry Andric bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) { 1673*06c3fb27SDimitry Andric Value *Cond = I.getCondition(); 1674*06c3fb27SDimitry Andric Value *TrueVal = I.getTrueValue(); 1675*06c3fb27SDimitry Andric Value *FalseVal = I.getFalseValue(); 1676*06c3fb27SDimitry Andric Value *CmpVal; 1677*06c3fb27SDimitry Andric FCmpInst::Predicate Pred; 16780b57cec5SDimitry Andric 1679*06c3fb27SDimitry Andric if (ST->has16BitInsts() && needsPromotionToI32(I.getType())) { 1680*06c3fb27SDimitry Andric if (UA->isUniform(&I)) 1681*06c3fb27SDimitry Andric return promoteUniformOpToI32(I); 1682*06c3fb27SDimitry Andric return false; 16830b57cec5SDimitry Andric } 16840b57cec5SDimitry Andric 1685*06c3fb27SDimitry Andric // Match fract pattern with nan check. 1686*06c3fb27SDimitry Andric if (!match(Cond, m_FCmp(Pred, m_Value(CmpVal), m_NonNaN()))) 1687*06c3fb27SDimitry Andric return false; 1688*06c3fb27SDimitry Andric 1689*06c3fb27SDimitry Andric FPMathOperator *FPOp = dyn_cast<FPMathOperator>(&I); 1690*06c3fb27SDimitry Andric if (!FPOp) 1691*06c3fb27SDimitry Andric return false; 1692*06c3fb27SDimitry Andric 1693*06c3fb27SDimitry Andric IRBuilder<> Builder(&I); 1694*06c3fb27SDimitry Andric Builder.setFastMathFlags(FPOp->getFastMathFlags()); 1695*06c3fb27SDimitry Andric 1696*06c3fb27SDimitry Andric auto *IITrue = dyn_cast<IntrinsicInst>(TrueVal); 1697*06c3fb27SDimitry Andric auto *IIFalse = dyn_cast<IntrinsicInst>(FalseVal); 1698*06c3fb27SDimitry Andric 1699*06c3fb27SDimitry Andric Value *Fract = nullptr; 1700*06c3fb27SDimitry Andric if (Pred == FCmpInst::FCMP_UNO && TrueVal == CmpVal && IIFalse && 1701*06c3fb27SDimitry Andric CmpVal == matchFractPat(*IIFalse)) { 1702*06c3fb27SDimitry Andric // isnan(x) ? 
x : fract(x) 1703*06c3fb27SDimitry Andric Fract = applyFractPat(Builder, CmpVal); 1704*06c3fb27SDimitry Andric } else if (Pred == FCmpInst::FCMP_ORD && FalseVal == CmpVal && IITrue && 1705*06c3fb27SDimitry Andric CmpVal == matchFractPat(*IITrue)) { 1706*06c3fb27SDimitry Andric // !isnan(x) ? fract(x) : x 1707*06c3fb27SDimitry Andric Fract = applyFractPat(Builder, CmpVal); 1708*06c3fb27SDimitry Andric } else 1709*06c3fb27SDimitry Andric return false; 1710*06c3fb27SDimitry Andric 1711*06c3fb27SDimitry Andric Fract->takeName(&I); 1712*06c3fb27SDimitry Andric I.replaceAllUsesWith(Fract); 1713*06c3fb27SDimitry Andric RecursivelyDeleteTriviallyDeadInstructions(&I, TLInfo); 1714*06c3fb27SDimitry Andric return true; 1715*06c3fb27SDimitry Andric } 1716*06c3fb27SDimitry Andric 1717*06c3fb27SDimitry Andric static bool areInSameBB(const Value *A, const Value *B) { 1718*06c3fb27SDimitry Andric const auto *IA = dyn_cast<Instruction>(A); 1719*06c3fb27SDimitry Andric const auto *IB = dyn_cast<Instruction>(B); 1720*06c3fb27SDimitry Andric return IA && IB && IA->getParent() == IB->getParent(); 1721*06c3fb27SDimitry Andric } 1722*06c3fb27SDimitry Andric 1723*06c3fb27SDimitry Andric // Helper for breaking large PHIs that returns true when an extractelement on V 1724*06c3fb27SDimitry Andric // is likely to be folded away by the DAG combiner. 1725*06c3fb27SDimitry Andric static bool isInterestingPHIIncomingValue(const Value *V) { 1726*06c3fb27SDimitry Andric const auto *FVT = dyn_cast<FixedVectorType>(V->getType()); 1727*06c3fb27SDimitry Andric if (!FVT) 1728*06c3fb27SDimitry Andric return false; 1729*06c3fb27SDimitry Andric 1730*06c3fb27SDimitry Andric const Value *CurVal = V; 1731*06c3fb27SDimitry Andric 1732*06c3fb27SDimitry Andric // Check for insertelements, keeping track of the elements covered. 1733*06c3fb27SDimitry Andric BitVector EltsCovered(FVT->getNumElements()); 1734*06c3fb27SDimitry Andric while (const auto *IE = dyn_cast<InsertElementInst>(CurVal)) { 1735*06c3fb27SDimitry Andric const auto *Idx = dyn_cast<ConstantInt>(IE->getOperand(2)); 1736*06c3fb27SDimitry Andric 1737*06c3fb27SDimitry Andric // Non constant index/out of bounds index -> folding is unlikely. 1738*06c3fb27SDimitry Andric // The latter is more of a sanity check because canonical IR should just 1739*06c3fb27SDimitry Andric // have replaced those with poison. 1740*06c3fb27SDimitry Andric if (!Idx || Idx->getSExtValue() >= FVT->getNumElements()) 1741*06c3fb27SDimitry Andric return false; 1742*06c3fb27SDimitry Andric 1743*06c3fb27SDimitry Andric const auto *VecSrc = IE->getOperand(0); 1744*06c3fb27SDimitry Andric 1745*06c3fb27SDimitry Andric // If the vector source is another instruction, it must be in the same basic 1746*06c3fb27SDimitry Andric // block. Otherwise, the DAGCombiner won't see the whole thing and is 1747*06c3fb27SDimitry Andric // unlikely to be able to do anything interesting here. 1748*06c3fb27SDimitry Andric if (isa<Instruction>(VecSrc) && !areInSameBB(VecSrc, IE)) 1749*06c3fb27SDimitry Andric return false; 1750*06c3fb27SDimitry Andric 1751*06c3fb27SDimitry Andric CurVal = VecSrc; 1752*06c3fb27SDimitry Andric EltsCovered.set(Idx->getSExtValue()); 1753*06c3fb27SDimitry Andric 1754*06c3fb27SDimitry Andric // All elements covered. 
1755*06c3fb27SDimitry Andric if (EltsCovered.all()) 1756*06c3fb27SDimitry Andric return true; 1757*06c3fb27SDimitry Andric } 1758*06c3fb27SDimitry Andric 1759*06c3fb27SDimitry Andric // We either didn't find a single insertelement, or the insertelement chain 1760*06c3fb27SDimitry Andric // ended before all elements were covered. Check for other interesting values. 1761*06c3fb27SDimitry Andric 1762*06c3fb27SDimitry Andric // Constants are always interesting because we can just constant fold the 1763*06c3fb27SDimitry Andric // extractelements. 1764*06c3fb27SDimitry Andric if (isa<Constant>(CurVal)) 1765*06c3fb27SDimitry Andric return true; 1766*06c3fb27SDimitry Andric 1767*06c3fb27SDimitry Andric // shufflevector is likely to be profitable if either operand is a constant, 1768*06c3fb27SDimitry Andric // or if either source is in the same block. 1769*06c3fb27SDimitry Andric // This is because shufflevector is most often lowered as a series of 1770*06c3fb27SDimitry Andric // insert/extract elements anyway. 1771*06c3fb27SDimitry Andric if (const auto *SV = dyn_cast<ShuffleVectorInst>(CurVal)) { 1772*06c3fb27SDimitry Andric return isa<Constant>(SV->getOperand(1)) || 1773*06c3fb27SDimitry Andric areInSameBB(SV, SV->getOperand(0)) || 1774*06c3fb27SDimitry Andric areInSameBB(SV, SV->getOperand(1)); 1775*06c3fb27SDimitry Andric } 1776*06c3fb27SDimitry Andric 1777*06c3fb27SDimitry Andric return false; 1778*06c3fb27SDimitry Andric } 1779*06c3fb27SDimitry Andric 1780*06c3fb27SDimitry Andric bool AMDGPUCodeGenPrepareImpl::canBreakPHINode(const PHINode &I) { 1781*06c3fb27SDimitry Andric // Check in the cache, or add an entry for this node. 1782*06c3fb27SDimitry Andric // 1783*06c3fb27SDimitry Andric // We init with false because we consider all PHI nodes unbreakable until we 1784*06c3fb27SDimitry Andric // reach a conclusion. Doing the opposite - assuming they're break-able until 1785*06c3fb27SDimitry Andric // proven otherwise - can be harmful in some pathological cases so we're 1786*06c3fb27SDimitry Andric // conservative for now. 1787*06c3fb27SDimitry Andric const auto [It, DidInsert] = BreakPhiNodesCache.insert({&I, false}); 1788*06c3fb27SDimitry Andric if (!DidInsert) 1789*06c3fb27SDimitry Andric return It->second; 1790*06c3fb27SDimitry Andric 1791*06c3fb27SDimitry Andric // This function may recurse, so to guard against infinite looping, this PHI 1792*06c3fb27SDimitry Andric // is conservatively considered unbreakable until we reach a conclusion. 1793*06c3fb27SDimitry Andric 1794*06c3fb27SDimitry Andric // Don't break PHIs that have no interesting incoming values. That is, where 1795*06c3fb27SDimitry Andric // there is no clear opportunity to fold the "extractelement" instructions we 1796*06c3fb27SDimitry Andric // would add. 1797*06c3fb27SDimitry Andric // 1798*06c3fb27SDimitry Andric // Note: IC does not run after this pass, so we're only interested in the 1799*06c3fb27SDimitry Andric // foldings that the DAG combiner can do. 1800*06c3fb27SDimitry Andric if (none_of(I.incoming_values(), 1801*06c3fb27SDimitry Andric [&](Value *V) { return isInterestingPHIIncomingValue(V); })) 1802*06c3fb27SDimitry Andric return false; 1803*06c3fb27SDimitry Andric 1804*06c3fb27SDimitry Andric // Now, check users for unbreakable PHI nodes. If we have an unbreakable PHI 1805*06c3fb27SDimitry Andric // node as user, we don't want to break this PHI either because it's unlikely 1806*06c3fb27SDimitry Andric // to be beneficial. 
We would just explode the vector and reassemble it 1807*06c3fb27SDimitry Andric // directly, wasting instructions. 1808*06c3fb27SDimitry Andric // 1809*06c3fb27SDimitry Andric // In the case where multiple users are PHI nodes, we want at least half of 1810*06c3fb27SDimitry Andric // them to be breakable. 1811*06c3fb27SDimitry Andric int Score = 0; 1812*06c3fb27SDimitry Andric for (const Value *U : I.users()) { 1813*06c3fb27SDimitry Andric if (const auto *PU = dyn_cast<PHINode>(U)) 1814*06c3fb27SDimitry Andric Score += canBreakPHINode(*PU) ? 1 : -1; 1815*06c3fb27SDimitry Andric } 1816*06c3fb27SDimitry Andric 1817*06c3fb27SDimitry Andric if (Score < 0) 1818*06c3fb27SDimitry Andric return false; 1819*06c3fb27SDimitry Andric 1820*06c3fb27SDimitry Andric return BreakPhiNodesCache[&I] = true; 1821*06c3fb27SDimitry Andric } 1822*06c3fb27SDimitry Andric 1823*06c3fb27SDimitry Andric /// Helper class for "break large PHIs" (visitPHINode). 1824*06c3fb27SDimitry Andric /// 1825*06c3fb27SDimitry Andric /// This represents a slice of a PHI's incoming value, which is made up of: 1826*06c3fb27SDimitry Andric /// - The type of the slice (Ty) 1827*06c3fb27SDimitry Andric /// - The index in the incoming value's vector where the slice starts (Idx) 1828*06c3fb27SDimitry Andric /// - The number of elements in the slice (NumElts). 1829*06c3fb27SDimitry Andric /// It also keeps track of the NewPHI node inserted for this particular slice. 1830*06c3fb27SDimitry Andric /// 1831*06c3fb27SDimitry Andric /// Slice examples: 1832*06c3fb27SDimitry Andric /// <4 x i64> -> Split into four i64 slices. 1833*06c3fb27SDimitry Andric /// -> [i64, 0, 1], [i64, 1, 1], [i64, 2, 1], [i64, 3, 1] 1834*06c3fb27SDimitry Andric /// <5 x i16> -> Split into 2 <2 x i16> slices + a i16 tail. 1835*06c3fb27SDimitry Andric /// -> [<2 x i16>, 0, 2], [<2 x i16>, 2, 2], [i16, 4, 1] 1836*06c3fb27SDimitry Andric class VectorSlice { 1837*06c3fb27SDimitry Andric public: 1838*06c3fb27SDimitry Andric VectorSlice(Type *Ty, unsigned Idx, unsigned NumElts) 1839*06c3fb27SDimitry Andric : Ty(Ty), Idx(Idx), NumElts(NumElts) {} 1840*06c3fb27SDimitry Andric 1841*06c3fb27SDimitry Andric Type *Ty = nullptr; 1842*06c3fb27SDimitry Andric unsigned Idx = 0; 1843*06c3fb27SDimitry Andric unsigned NumElts = 0; 1844*06c3fb27SDimitry Andric PHINode *NewPHI = nullptr; 1845*06c3fb27SDimitry Andric 1846*06c3fb27SDimitry Andric /// Slice \p Inc according to the information contained within this slice. 1847*06c3fb27SDimitry Andric /// This is cached, so if called multiple times for the same \p BB & \p Inc 1848*06c3fb27SDimitry Andric /// pair, it returns the same Sliced value as well. 1849*06c3fb27SDimitry Andric /// 1850*06c3fb27SDimitry Andric /// Note this *intentionally* does not return the same value for, say, 1851*06c3fb27SDimitry Andric /// [%bb.0, %0] & [%bb.1, %0] as: 1852*06c3fb27SDimitry Andric /// - It could cause issues with dominance (e.g. if bb.1 is seen first, then 1853*06c3fb27SDimitry Andric /// the value in bb.1 may not be reachable from bb.0 if it's its 1854*06c3fb27SDimitry Andric /// predecessor.) 1855*06c3fb27SDimitry Andric /// - We also want to make our extract instructions as local as possible so 1856*06c3fb27SDimitry Andric /// the DAG has better chances of folding them out. Duplicating them like 1857*06c3fb27SDimitry Andric /// that is beneficial in that regard. 
1858*06c3fb27SDimitry Andric /// 1859*06c3fb27SDimitry Andric /// This is both a minor optimization to avoid creating duplicate 1860*06c3fb27SDimitry Andric /// instructions, but also a requirement for correctness. It is not forbidden 1861*06c3fb27SDimitry Andric /// for a PHI node to have the same [BB, Val] pair multiple times. If we 1862*06c3fb27SDimitry Andric /// returned a new value each time, those previously identical pairs would all 1863*06c3fb27SDimitry Andric /// have different incoming values (from the same block) and it'd cause a "PHI 1864*06c3fb27SDimitry Andric /// node has multiple entries for the same basic block with different incoming 1865*06c3fb27SDimitry Andric /// values!" verifier error. 1866*06c3fb27SDimitry Andric Value *getSlicedVal(BasicBlock *BB, Value *Inc, StringRef NewValName) { 1867*06c3fb27SDimitry Andric Value *&Res = SlicedVals[{BB, Inc}]; 1868*06c3fb27SDimitry Andric if (Res) 1869*06c3fb27SDimitry Andric return Res; 1870*06c3fb27SDimitry Andric 1871*06c3fb27SDimitry Andric IRBuilder<> B(BB->getTerminator()); 1872*06c3fb27SDimitry Andric if (Instruction *IncInst = dyn_cast<Instruction>(Inc)) 1873*06c3fb27SDimitry Andric B.SetCurrentDebugLocation(IncInst->getDebugLoc()); 1874*06c3fb27SDimitry Andric 1875*06c3fb27SDimitry Andric if (NumElts > 1) { 1876*06c3fb27SDimitry Andric SmallVector<int, 4> Mask; 1877*06c3fb27SDimitry Andric for (unsigned K = Idx; K < (Idx + NumElts); ++K) 1878*06c3fb27SDimitry Andric Mask.push_back(K); 1879*06c3fb27SDimitry Andric Res = B.CreateShuffleVector(Inc, Mask, NewValName); 1880*06c3fb27SDimitry Andric } else 1881*06c3fb27SDimitry Andric Res = B.CreateExtractElement(Inc, Idx, NewValName); 1882*06c3fb27SDimitry Andric 1883*06c3fb27SDimitry Andric return Res; 1884*06c3fb27SDimitry Andric } 1885*06c3fb27SDimitry Andric 1886*06c3fb27SDimitry Andric private: 1887*06c3fb27SDimitry Andric SmallDenseMap<std::pair<BasicBlock *, Value *>, Value *> SlicedVals; 1888*06c3fb27SDimitry Andric }; 1889*06c3fb27SDimitry Andric 1890*06c3fb27SDimitry Andric bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) { 1891*06c3fb27SDimitry Andric // Break-up fixed-vector PHIs into smaller pieces. 1892*06c3fb27SDimitry Andric // Default threshold is 32, so it breaks up any vector that's >32 bits into 1893*06c3fb27SDimitry Andric // its elements, or into 32-bit pieces (for 8/16 bit elts). 1894*06c3fb27SDimitry Andric // 1895*06c3fb27SDimitry Andric // This is only helpful for DAGISel because it doesn't handle large PHIs as 1896*06c3fb27SDimitry Andric // well as GlobalISel. DAGISel lowers PHIs by using CopyToReg/CopyFromReg. 1897*06c3fb27SDimitry Andric // With large, odd-sized PHIs we may end up needing many `build_vector` 1898*06c3fb27SDimitry Andric // operations with most elements being "undef". This inhibits a lot of 1899*06c3fb27SDimitry Andric // optimization opportunities and can result in unreasonably high register 1900*06c3fb27SDimitry Andric // pressure and the inevitable stack spilling. 
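// Sketch of the transform (illustrative): a phi of <8 x i16> (128 bits, over
// the default 32-bit threshold) is rebuilt as four <2 x i16> phis. Each
// incoming value is sliced with shufflevector/extractelement in its
// predecessor block, and the original phi is then reassembled from the new
// phis with insertvector/insertelement.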
1901*06c3fb27SDimitry Andric if (!ScalarizeLargePHIs || getCGPassBuilderOption().EnableGlobalISelOption)
1902*06c3fb27SDimitry Andric return false;
1903*06c3fb27SDimitry Andric
1904*06c3fb27SDimitry Andric FixedVectorType *FVT = dyn_cast<FixedVectorType>(I.getType());
1905*06c3fb27SDimitry Andric if (!FVT || DL->getTypeSizeInBits(FVT) <= ScalarizeLargePHIsThreshold)
1906*06c3fb27SDimitry Andric return false;
1907*06c3fb27SDimitry Andric
1908*06c3fb27SDimitry Andric if (!ForceScalarizeLargePHIs && !canBreakPHINode(I))
1909*06c3fb27SDimitry Andric return false;
1910*06c3fb27SDimitry Andric
1911*06c3fb27SDimitry Andric std::vector<VectorSlice> Slices;
1912*06c3fb27SDimitry Andric
1913*06c3fb27SDimitry Andric Type *EltTy = FVT->getElementType();
1914*06c3fb27SDimitry Andric {
1915*06c3fb27SDimitry Andric unsigned Idx = 0;
1916*06c3fb27SDimitry Andric // For 8/16 bit types, don't scalarize fully but break it up into as many
1917*06c3fb27SDimitry Andric // 32-bit slices as we can, and scalarize the tail.
1918*06c3fb27SDimitry Andric const unsigned EltSize = DL->getTypeSizeInBits(EltTy);
1919*06c3fb27SDimitry Andric const unsigned NumElts = FVT->getNumElements();
1920*06c3fb27SDimitry Andric if (EltSize == 8 || EltSize == 16) {
1921*06c3fb27SDimitry Andric const unsigned SubVecSize = (32 / EltSize);
1922*06c3fb27SDimitry Andric Type *SubVecTy = FixedVectorType::get(EltTy, SubVecSize);
1923*06c3fb27SDimitry Andric for (unsigned End = alignDown(NumElts, SubVecSize); Idx < End;
1924*06c3fb27SDimitry Andric Idx += SubVecSize)
1925*06c3fb27SDimitry Andric Slices.emplace_back(SubVecTy, Idx, SubVecSize);
1926*06c3fb27SDimitry Andric }
1927*06c3fb27SDimitry Andric
1928*06c3fb27SDimitry Andric // Scalarize all remaining elements.
1929*06c3fb27SDimitry Andric for (; Idx < NumElts; ++Idx)
1930*06c3fb27SDimitry Andric Slices.emplace_back(EltTy, Idx, 1);
1931*06c3fb27SDimitry Andric }
1932*06c3fb27SDimitry Andric
1933*06c3fb27SDimitry Andric if (Slices.size() == 1)
1934*06c3fb27SDimitry Andric return false;
1935*06c3fb27SDimitry Andric
1936*06c3fb27SDimitry Andric // Create one PHI per vector piece. The "VectorSlice" class takes care of
1937*06c3fb27SDimitry Andric // creating the necessary instruction to extract the relevant slices of each
1938*06c3fb27SDimitry Andric // incoming value.
1939*06c3fb27SDimitry Andric IRBuilder<> B(I.getParent());
1940*06c3fb27SDimitry Andric B.SetCurrentDebugLocation(I.getDebugLoc());
1941*06c3fb27SDimitry Andric
1942*06c3fb27SDimitry Andric unsigned IncNameSuffix = 0;
1943*06c3fb27SDimitry Andric for (VectorSlice &S : Slices) {
1944*06c3fb27SDimitry Andric // We need to reset the builder on each iteration, because getSlicedVal may
1945*06c3fb27SDimitry Andric // have inserted something into I's BB.
1946*06c3fb27SDimitry Andric B.SetInsertPoint(I.getParent()->getFirstNonPHI());
1947*06c3fb27SDimitry Andric S.NewPHI = B.CreatePHI(S.Ty, I.getNumIncomingValues());
1948*06c3fb27SDimitry Andric
1949*06c3fb27SDimitry Andric for (const auto &[Idx, BB] : enumerate(I.blocks())) {
1950*06c3fb27SDimitry Andric S.NewPHI->addIncoming(S.getSlicedVal(BB, I.getIncomingValue(Idx),
1951*06c3fb27SDimitry Andric "largephi.extractslice" +
1952*06c3fb27SDimitry Andric std::to_string(IncNameSuffix++)),
1953*06c3fb27SDimitry Andric BB);
1954*06c3fb27SDimitry Andric }
1955*06c3fb27SDimitry Andric }
1956*06c3fb27SDimitry Andric
1957*06c3fb27SDimitry Andric // And replace this PHI with a vector of all the previous PHI values.
  Value *Vec = PoisonValue::get(FVT);
  unsigned NameSuffix = 0;
  for (VectorSlice &S : Slices) {
    const auto ValName = "largephi.insertslice" + std::to_string(NameSuffix++);
    if (S.NumElts > 1)
      Vec =
          B.CreateInsertVector(FVT, Vec, S.NewPHI, B.getInt64(S.Idx), ValName);
    else
      Vec = B.CreateInsertElement(Vec, S.NewPHI, S.Idx, ValName);
  }

  I.replaceAllUsesWith(Vec);
  I.eraseFromParent();
  return true;
}

bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
  switch (I.getIntrinsicID()) {
  case Intrinsic::bitreverse:
    return visitBitreverseIntrinsicInst(I);
  case Intrinsic::minnum:
    return visitMinNum(I);
  default:
    return false;
  }
}

bool AMDGPUCodeGenPrepareImpl::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      UA->isUniform(&I))
    Changed |= promoteUniformBitreverseToI32(I);

  return Changed;
}

/// Match the non-nan fract pattern:
///   minnum(fsub(x, floor(x)), nextafter(1.0, -1.0))
///
/// Only matched if fract is a useful instruction for the subtarget. This does
/// not account for nan handling; the instruction has a nan check on the input
/// value.
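///
/// As an illustrative sketch (constants shown for f32; not taken from an
/// actual test), the matched IR looks like:
///
///   %floor = call float @llvm.floor.f32(float %x)
///   %sub = fsub float %x, %floor
///   %min = call float @llvm.minnum.f32(float %sub, float 0x3FEFFFFFE0000000)
///
/// which visitMinNum rewrites (when %x is known not to be a nan) into:
///
///   %min = call float @llvm.amdgcn.fract.f32(float %x)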
Value *AMDGPUCodeGenPrepareImpl::matchFractPat(IntrinsicInst &I) {
  if (ST->hasFractBug())
    return nullptr;

  if (I.getIntrinsicID() != Intrinsic::minnum)
    return nullptr;

  Type *Ty = I.getType();
  if (!isLegalFloatingTy(Ty->getScalarType()))
    return nullptr;

  Value *Arg0 = I.getArgOperand(0);
  Value *Arg1 = I.getArgOperand(1);

  const APFloat *C;
  if (!match(Arg1, m_APFloat(C)))
    return nullptr;

  APFloat One(1.0);
  bool LosesInfo;
  One.convert(C->getSemantics(), APFloat::rmNearestTiesToEven, &LosesInfo);

  // Match nextafter(1.0, -1.0)
  One.next(true);
  if (One != *C)
    return nullptr;

  Value *FloorSrc;
  if (match(Arg0, m_FSub(m_Value(FloorSrc),
                         m_Intrinsic<Intrinsic::floor>(m_Deferred(FloorSrc)))))
    return FloorSrc;
  return nullptr;
}

Value *AMDGPUCodeGenPrepareImpl::applyFractPat(IRBuilder<> &Builder,
                                               Value *FractArg) {
  SmallVector<Value *, 4> FractVals;
  extractValues(Builder, FractVals, FractArg);

  SmallVector<Value *, 4> ResultVals(FractVals.size());

  Type *Ty = FractArg->getType()->getScalarType();
  for (unsigned I = 0, E = FractVals.size(); I != E; ++I) {
    ResultVals[I] =
        Builder.CreateIntrinsic(Intrinsic::amdgcn_fract, {Ty}, {FractVals[I]});
  }

  return insertValues(Builder, FractArg->getType(), ResultVals);
}

bool AMDGPUCodeGenPrepareImpl::visitMinNum(IntrinsicInst &I) {
  Value *FractArg = matchFractPat(I);
  if (!FractArg)
    return false;

  // Match the pattern for the fract intrinsic in contexts where the nan check
  // has been optimized out (and hope the knowledge that the source can't be a
  // nan wasn't lost).
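  // (For example, the rewrite still fires when the minnum call itself carries
  // the nnan fast-math flag, or when value tracking can prove the operand is
  // never a nan, as checked below.)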
  if (!I.hasNoNaNs() && !isKnownNeverNaN(FractArg, *DL, TLInfo))
    return false;

  IRBuilder<> Builder(&I);
  FastMathFlags FMF = I.getFastMathFlags();
  FMF.setNoNaNs();
  Builder.setFastMathFlags(FMF);

  Value *Fract = applyFractPat(Builder, FractArg);
  Fract->takeName(&I);
  I.replaceAllUsesWith(Fract);

  RecursivelyDeleteTriviallyDeadInstructions(&I, TLInfo);
  return true;
}

bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
  Impl.Mod = &M;
  Impl.DL = &Impl.Mod->getDataLayout();
  return false;
}

bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
  Impl.TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
  Impl.ST = &TM.getSubtarget<GCNSubtarget>(F);
  Impl.AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  Impl.UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
  auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  Impl.DT = DTWP ? &DTWP->getDomTree() : nullptr;
  Impl.HasUnsafeFPMath = hasUnsafeFPMath(F);
  SIModeRegisterDefaults Mode(F);
  Impl.HasFP32DenormalFlush =
      Mode.FP32Denormals == DenormalMode::getPreserveSign();
  return Impl.run(F);
}

PreservedAnalyses AMDGPUCodeGenPreparePass::run(Function &F,
                                                FunctionAnalysisManager &FAM) {
  AMDGPUCodeGenPrepareImpl Impl;
  Impl.Mod = F.getParent();
  Impl.DL = &Impl.Mod->getDataLayout();
  Impl.TLInfo = &FAM.getResult<TargetLibraryAnalysis>(F);
  Impl.ST = &TM.getSubtarget<GCNSubtarget>(F);
  Impl.AC = &FAM.getResult<AssumptionAnalysis>(F);
  Impl.UA = &FAM.getResult<UniformityInfoAnalysis>(F);
  Impl.DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
  Impl.HasUnsafeFPMath = hasUnsafeFPMath(F);
  SIModeRegisterDefaults Mode(F);
  Impl.HasFP32DenormalFlush =
      Mode.FP32Denormals == DenormalMode::getPreserveSign();
  PreservedAnalyses PA = PreservedAnalyses::none();
  if (!Impl.FlowChanged)
    PA.preserveSet<CFGAnalyses>();
  return Impl.run(F) ? PA : PreservedAnalyses::all();
}

INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
                    false, false)

char AMDGPUCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
  return new AMDGPUCodeGenPrepare();
}
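// A minimal sketch of how this pass can be exercised in isolation (assuming
// the new-PM registration uses the DEBUG_TYPE string, as is conventional for
// AMDGPU passes; the exact spelling lives in AMDGPUPassRegistry.def and the
// INITIALIZE_PASS macros above):
//
//   opt -S -mtriple=amdgcn-- -mcpu=gfx900 \
//       -passes=amdgpu-codegenprepare in.ll -o out.ll
//
// The cl::opt flags declared at the top of this file can be appended to the
// command line to force or disable individual transforms when testing.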