//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include <cassert>
#include <iterator>

#define DEBUG_TYPE "amdgpu-codegenprepare"

using namespace llvm;

namespace {

static cl::opt<bool> WidenLoads(
  "amdgpu-codegenprepare-widen-constant-loads",
  cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
  cl::ReallyHidden,
  cl::init(true));

class AMDGPUCodeGenPrepare : public FunctionPass,
                             public InstVisitor<AMDGPUCodeGenPrepare, bool> {
  const GCNSubtarget *ST = nullptr;
  AssumptionCache *AC = nullptr;
  LegacyDivergenceAnalysis *DA = nullptr;
  Module *Mod = nullptr;
  const DataLayout *DL = nullptr;
  bool HasUnsafeFPMath = false;

  /// \returns \p T's base element bit width.
  unsigned getBaseElementBitWidth(const Type *T) const;

  /// \returns Equivalent 32 bit integer type for given type \p T. For example,
  /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
  /// is returned.
  Type *getI32Ty(IRBuilder<> &B, const Type *T) const;

  /// \returns True if binary operation \p I is a signed binary operation, false
  /// otherwise.
  bool isSigned(const BinaryOperator &I) const;

  /// \returns True if the condition of 'select' operation \p I comes from a
  /// signed 'icmp' operation, false otherwise.
  bool isSigned(const SelectInst &I) const;

  /// \returns True if type \p T needs to be promoted to 32 bit integer type,
  /// false otherwise.
  bool needsPromotionToI32(const Type *T) const;

  /// Promotes uniform binary operation \p I to equivalent 32 bit binary
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal 16. Promotion is done by sign or zero extending operands to
  /// 32 bits, replacing \p I with equivalent 32 bit binary operation, and
  /// truncating the result of 32 bit binary operation back to \p I's original
  /// type. Division operation is not promoted.
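  ///
  /// Illustrative sketch of the rewrite (an assumed uniform i16 'add'; not
  /// taken verbatim from a test):
  ///   %r = add i16 %a, %b
  /// becomes
  ///   %a32 = zext i16 %a to i32
  ///   %b32 = zext i16 %b to i32
  ///   %r32 = add nuw nsw i32 %a32, %b32
  ///   %r   = trunc i32 %r32 to i16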
  ///
  /// \returns True if \p I is promoted to equivalent 32 bit binary operation,
  /// false otherwise.
  bool promoteUniformOpToI32(BinaryOperator &I) const;

  /// Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal 16. Promotion is done by sign or zero extending operands to
  /// 32 bits, and replacing \p I with 32 bit 'icmp' operation.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(ICmpInst &I) const;

  /// Promotes uniform 'select' operation \p I to 32 bit 'select'
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal 16. Promotion is done by sign or zero extending operands to
  /// 32 bits, replacing \p I with 32 bit 'select' operation, and truncating the
  /// result of 32 bit 'select' operation back to \p I's original type.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(SelectInst &I) const;

  /// Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse'
  /// intrinsic.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal 16. Promotion is done by zero extending the operand to 32
  /// bits, replacing \p I with 32 bit 'bitreverse' intrinsic, shifting the
  /// result of 32 bit 'bitreverse' intrinsic to the right with zero fill (the
  /// shift amount is 32 minus \p I's base element bit width), and truncating
  /// the result of the shift operation back to \p I's original type.
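  ///
  /// Illustrative sketch (an assumed i16 operand; not taken verbatim from a
  /// test):
  ///   %r = call i16 @llvm.bitreverse.i16(i16 %x)
  /// becomes
  ///   %e   = zext i16 %x to i32
  ///   %r32 = call i32 @llvm.bitreverse.i32(i32 %e)
  ///   %s   = lshr i32 %r32, 16
  ///   %r   = trunc i32 %s to i16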
  ///
  /// \returns True.
  bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;

  unsigned numBitsUnsigned(Value *Op, unsigned ScalarSize) const;
  unsigned numBitsSigned(Value *Op, unsigned ScalarSize) const;
  bool isI24(Value *V, unsigned ScalarSize) const;
  bool isU24(Value *V, unsigned ScalarSize) const;

  /// Replace mul instructions with llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.i24.
  /// SelectionDAG has an issue where an 'and' asserting that the bits are
  /// known is not reliably used to form mul24, so do the replacement here on
  /// the IR instead.
  bool replaceMulWithMul24(BinaryOperator &I) const;

  /// Expands 24 bit div or rem.
  Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I,
                        Value *Num, Value *Den,
                        bool IsDiv, bool IsSigned) const;

  /// Expands 32 bit div or rem.
  Value* expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I,
                        Value *Num, Value *Den) const;

  /// Widen a scalar load.
  ///
  /// \details Widen a uniform, small-type load from constant memory to a full
  /// 32 bits, and then truncate the result, to allow a scalar load instead of
  /// a vector load.
  ///
  /// \returns True if the load can be widened.
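  ///
  /// Illustrative sketch of the widening done in visitLoadInst (an assumed
  /// i8 load; addrspace(4) for constant memory is an assumption here):
  ///   %v = load i8, i8 addrspace(4)* %p, align 4
  /// becomes
  ///   %c = bitcast i8 addrspace(4)* %p to i32 addrspace(4)*
  ///   %w = load i32, i32 addrspace(4)* %c, align 4
  ///   %v = trunc i32 %w to i8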
  bool canWidenScalarExtLoad(LoadInst &I) const;

public:
  static char ID;

  AMDGPUCodeGenPrepare() : FunctionPass(ID) {}

  bool visitFDiv(BinaryOperator &I);

  bool visitInstruction(Instruction &I) { return false; }
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitLoadInst(LoadInst &I);
  bool visitICmpInst(ICmpInst &I);
  bool visitSelectInst(SelectInst &I);

  bool visitIntrinsicInst(IntrinsicInst &I);
  bool visitBitreverseIntrinsicInst(IntrinsicInst &I);

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override { return "AMDGPU IR optimizations"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<LegacyDivergenceAnalysis>();
    AU.setPreservesAll();
  }
};

} // end anonymous namespace

unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return T->getIntegerBitWidth();
  return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
}

Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return B.getInt32Ty();
  return VectorType::get(B.getInt32Ty(), cast<VectorType>(T)->getNumElements());
}

bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
  return I.getOpcode() == Instruction::AShr ||
      I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;
}

bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
  return isa<ICmpInst>(I.getOperand(0)) ?
      cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
}

bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const {
  const IntegerType *IntTy = dyn_cast<IntegerType>(T);
  if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
    return true;

  if (const VectorType *VT = dyn_cast<VectorType>(T)) {
    // TODO: The set of packed operations is more limited, so may want to
    // promote some anyway.
    if (ST->hasVOP3PInsts())
      return false;

    return needsPromotionToI32(VT->getElementType());
  }

  return false;
}

// Return true if the op promoted to i32 should have nsw set.
static bool promotedOpIsNSW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Sub:
    return true;
  case Instruction::Mul:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}

// Return true if the op promoted to i32 should have nuw set.
static bool promotedOpIsNUW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Mul:
    return true;
  case Instruction::Sub:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}
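
// Worked example of the two predicates above (illustrative reasoning, not
// from the original source): a 'sub i16' is promoted with zero-extended
// operands, so the i32 difference stays within [-65535, 65535] and can never
// wrap in the signed sense (nsw is always safe), but it goes negative
// whenever %b > %a, so nuw is only safe if the original sub already had nuw.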

bool AMDGPUCodeGenPrepare::canWidenScalarExtLoad(LoadInst &I) const {
  Type *Ty = I.getType();
  const DataLayout &DL = Mod->getDataLayout();
  int TySize = DL.getTypeSizeInBits(Ty);
  unsigned Align = I.getAlignment() ?
                   I.getAlignment() : DL.getABITypeAlignment(Ty);

  return I.isSimple() && TySize < 32 && Align >= 4 && DA->isUniform(&I);
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  if (I.getOpcode() == Instruction::SDiv ||
      I.getOpcode() == Instruction::UDiv ||
      I.getOpcode() == Instruction::SRem ||
      I.getOpcode() == Instruction::URem)
    return false;

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }

  ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
  if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
    if (promotedOpIsNSW(cast<Instruction>(I)))
      Inst->setHasNoSignedWrap();

    if (promotedOpIsNUW(cast<Instruction>(I)))
      Inst->setHasNoUnsignedWrap();

    if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
      Inst->setIsExact(ExactOp->isExact());
  }

  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(ICmpInst &I) const {
  assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *NewICmp = nullptr;

  if (I.isSigned()) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }
  NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);

  I.replaceAllUsesWith(NewICmp);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(SelectInst &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp1 = nullptr;
  Value *ExtOp2 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
  } else {
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
  }
  ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(
    IntrinsicInst &I) const {
  assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
         "I must be bitreverse intrinsic");
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Function *I32 =
      Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty });
  Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
  Value *ExtRes = Builder.CreateCall(I32, { ExtOp });
  Value *LShrOp =
      Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
  Value *TruncRes =
      Builder.CreateTrunc(LShrOp, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

unsigned AMDGPUCodeGenPrepare::numBitsUnsigned(Value *Op,
                                               unsigned ScalarSize) const {
  KnownBits Known = computeKnownBits(Op, *DL, 0, AC);
  return ScalarSize - Known.countMinLeadingZeros();
}

unsigned AMDGPUCodeGenPrepare::numBitsSigned(Value *Op,
                                             unsigned ScalarSize) const {
  // In order for this to be a signed 24-bit value, bit 23 must be a sign bit.
  return ScalarSize - ComputeNumSignBits(Op, *DL, 0, AC);
}

bool AMDGPUCodeGenPrepare::isI24(Value *V, unsigned ScalarSize) const {
  return ScalarSize >= 24 && // Types less than 24-bit should be treated
                             // as unsigned 24-bit values.
    numBitsSigned(V, ScalarSize) < 24;
}

bool AMDGPUCodeGenPrepare::isU24(Value *V, unsigned ScalarSize) const {
  return numBitsUnsigned(V, ScalarSize) <= 24;
}

static void extractValues(IRBuilder<> &Builder,
                          SmallVectorImpl<Value *> &Values, Value *V) {
  VectorType *VT = dyn_cast<VectorType>(V->getType());
  if (!VT) {
    Values.push_back(V);
    return;
  }

  for (int I = 0, E = VT->getNumElements(); I != E; ++I)
    Values.push_back(Builder.CreateExtractElement(V, I));
}

static Value *insertValues(IRBuilder<> &Builder,
                           Type *Ty,
                           SmallVectorImpl<Value *> &Values) {
  if (Values.size() == 1)
    return Values[0];

  Value *NewVal = UndefValue::get(Ty);
  for (int I = 0, E = Values.size(); I != E; ++I)
    NewVal = Builder.CreateInsertElement(NewVal, Values[I], I);

  return NewVal;
}
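
// Illustrative sketch of the rewrite below (an assumed divergent i32 mul
// whose operands are both provably unsigned 24-bit values; not taken
// verbatim from a test):
//   %r = mul i32 %a, %b
// becomes
//   %r = call i32 @llvm.amdgcn.mul.u24(i32 %a, i32 %b)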
bool AMDGPUCodeGenPrepare::replaceMulWithMul24(BinaryOperator &I) const {
  if (I.getOpcode() != Instruction::Mul)
    return false;

  Type *Ty = I.getType();
  unsigned Size = Ty->getScalarSizeInBits();
  if (Size <= 16 && ST->has16BitInsts())
    return false;

  // Prefer scalar if this could be s_mul_i32
  if (DA->isUniform(&I))
    return false;

  Value *LHS = I.getOperand(0);
  Value *RHS = I.getOperand(1);
  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Intrinsic::ID IntrID = Intrinsic::not_intrinsic;

  // TODO: Should this try to match mulhi24?
  if (ST->hasMulU24() && isU24(LHS, Size) && isU24(RHS, Size)) {
    IntrID = Intrinsic::amdgcn_mul_u24;
  } else if (ST->hasMulI24() && isI24(LHS, Size) && isI24(RHS, Size)) {
    IntrID = Intrinsic::amdgcn_mul_i24;
  } else
    return false;

  SmallVector<Value *, 4> LHSVals;
  SmallVector<Value *, 4> RHSVals;
  SmallVector<Value *, 4> ResultVals;
  extractValues(Builder, LHSVals, LHS);
  extractValues(Builder, RHSVals, RHS);

  IntegerType *I32Ty = Builder.getInt32Ty();
  FunctionCallee Intrin = Intrinsic::getDeclaration(Mod, IntrID);
  for (int I = 0, E = LHSVals.size(); I != E; ++I) {
    Value *LHS, *RHS;
    if (IntrID == Intrinsic::amdgcn_mul_u24) {
      LHS = Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
      RHS = Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);
    } else {
      LHS = Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty);
      RHS = Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty);
    }

    Value *Result = Builder.CreateCall(Intrin, {LHS, RHS});

    if (IntrID == Intrinsic::amdgcn_mul_u24) {
      ResultVals.push_back(Builder.CreateZExtOrTrunc(Result,
                                                     LHSVals[I]->getType()));
    } else {
      ResultVals.push_back(Builder.CreateSExtOrTrunc(Result,
                                                     LHSVals[I]->getType()));
    }
  }

  I.replaceAllUsesWith(insertValues(Builder, Ty, ResultVals));
  I.eraseFromParent();

  return true;
}

static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv, bool HasDenormals) {
  const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
  if (!CNum)
    return HasDenormals;

  if (UnsafeDiv)
    return true;

  bool IsOne = CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0);

  // Reciprocal f32 is handled separately without denormals.
  return HasDenormals ^ IsOne;
}

// Insert an intrinsic for fast fdiv for safe math situations where we can
// reduce precision. Leave fdiv for situations where the generic node is
// expected to be optimized.
bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
  Type *Ty = FDiv.getType();

  if (!Ty->getScalarType()->isFloatTy())
    return false;

  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
  if (!FPMath)
    return false;

  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
  float ULP = FPOp->getFPAccuracy();
  if (ULP < 2.5f)
    return false;

  FastMathFlags FMF = FPOp->getFastMathFlags();
  bool UnsafeDiv = HasUnsafeFPMath || FMF.isFast() ||
                   FMF.allowReciprocal();

  // With UnsafeDiv, the node will be optimized to just rcp and mul.
  if (UnsafeDiv)
    return false;

  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
  Builder.setFastMathFlags(FMF);
  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

  Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);

  Value *Num = FDiv.getOperand(0);
  Value *Den = FDiv.getOperand(1);

  Value *NewFDiv = nullptr;

  bool HasDenormals = ST->hasFP32Denormals();
  if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
    NewFDiv = UndefValue::get(VT);

    // FIXME: Doesn't do the right thing for cases where the vector is partially
    // constant. This works when the scalarizer pass is run first.
    for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
      Value *NumEltI = Builder.CreateExtractElement(Num, I);
      Value *DenEltI = Builder.CreateExtractElement(Den, I);
      Value *NewElt;

      if (shouldKeepFDivF32(NumEltI, UnsafeDiv, HasDenormals)) {
        NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
      } else {
        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
      }

      NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
    }
  } else {
    if (!shouldKeepFDivF32(Num, UnsafeDiv, HasDenormals))
      NewFDiv = Builder.CreateCall(Decl, { Num, Den });
  }

  if (NewFDiv) {
    FDiv.replaceAllUsesWith(NewFDiv);
    NewFDiv->takeName(&FDiv);
    FDiv.eraseFromParent();
  }

  return !!NewFDiv;
}

static bool hasUnsafeFPMath(const Function &F) {
  Attribute Attr = F.getFnAttribute("unsafe-fp-math");
  return Attr.getValueAsString() == "true";
}

static std::pair<Value*, Value*> getMul64(IRBuilder<> &Builder,
                                          Value *LHS, Value *RHS) {
  Type *I32Ty = Builder.getInt32Ty();
  Type *I64Ty = Builder.getInt64Ty();

  Value *LHS_EXT64 = Builder.CreateZExt(LHS, I64Ty);
  Value *RHS_EXT64 = Builder.CreateZExt(RHS, I64Ty);
  Value *MUL64 = Builder.CreateMul(LHS_EXT64, RHS_EXT64);
  Value *Lo = Builder.CreateTrunc(MUL64, I32Ty);
  Value *Hi = Builder.CreateLShr(MUL64, Builder.getInt64(32));
  Hi = Builder.CreateTrunc(Hi, I32Ty);
  return std::make_pair(Lo, Hi);
}

static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) {
  return getMul64(Builder, LHS, RHS).second;
}
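
// Worked example of the two helpers above (illustrative): for
// LHS = 0x80000000 and RHS = 4, the full 64-bit product is 0x200000000,
// so getMul64 yields (Lo, Hi) = (0, 2) and getMulHu returns 2.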

// The fractional part of a float is enough to accurately represent up to
// a 24-bit signed integer.
Value* AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder,
                                            BinaryOperator &I,
                                            Value *Num, Value *Den,
                                            bool IsDiv, bool IsSigned) const {
  assert(Num->getType()->isIntegerTy(32));

  const DataLayout &DL = Mod->getDataLayout();
  unsigned LHSSignBits = ComputeNumSignBits(Num, DL, 0, AC, &I);
  if (LHSSignBits < 9)
    return nullptr;

  unsigned RHSSignBits = ComputeNumSignBits(Den, DL, 0, AC, &I);
  if (RHSSignBits < 9)
    return nullptr;

  unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
  unsigned DivBits = 32 - SignBits;
  if (IsSigned)
    ++DivBits;

  Type *Ty = Num->getType();
  Type *I32Ty = Builder.getInt32Ty();
  Type *F32Ty = Builder.getFloatTy();
  ConstantInt *One = Builder.getInt32(1);
  Value *JQ = One;

  if (IsSigned) {
    // char|short jq = ia ^ ib;
    JQ = Builder.CreateXor(Num, Den);

    // jq = jq >> (bitsize - 2)
    JQ = Builder.CreateAShr(JQ, Builder.getInt32(30));

    // jq = jq | 0x1
    JQ = Builder.CreateOr(JQ, One);
  }

  // int ia = (int)LHS;
  Value *IA = Num;

  // int ib = (int)RHS;
  Value *IB = Den;

  // float fa = (float)ia;
  Value *FA = IsSigned ? Builder.CreateSIToFP(IA, F32Ty)
                       : Builder.CreateUIToFP(IA, F32Ty);

  // float fb = (float)ib;
  Value *FB = IsSigned ? Builder.CreateSIToFP(IB, F32Ty)
                       : Builder.CreateUIToFP(IB, F32Ty);

  Value *RCP = Builder.CreateFDiv(ConstantFP::get(F32Ty, 1.0), FB);
  Value *FQM = Builder.CreateFMul(FA, RCP);

  // fq = trunc(fqm);
  CallInst *FQ = Builder.CreateUnaryIntrinsic(Intrinsic::trunc, FQM);
  FQ->copyFastMathFlags(Builder.getFastMathFlags());

  // float fqneg = -fq;
  Value *FQNeg = Builder.CreateFNeg(FQ);

  // float fr = mad(fqneg, fb, fa);
  Value *FR = Builder.CreateIntrinsic(Intrinsic::amdgcn_fmad_ftz,
                                      {FQNeg->getType()}, {FQNeg, FB, FA}, FQ);

  // int iq = (int)fq;
  Value *IQ = IsSigned ? Builder.CreateFPToSI(FQ, I32Ty)
                       : Builder.CreateFPToUI(FQ, I32Ty);

  // fr = fabs(fr);
  FR = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FR, FQ);

  // fb = fabs(fb);
  FB = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FB, FQ);

  // int cv = fr >= fb;
  Value *CV = Builder.CreateFCmpOGE(FR, FB);

  // jq = (cv ? jq : 0);
  JQ = Builder.CreateSelect(CV, JQ, Builder.getInt32(0));

  // dst = iq + jq;
  Value *Div = Builder.CreateAdd(IQ, JQ);

  Value *Res = Div;
  if (!IsDiv) {
    // Rem needs compensation, it's easier to recompute it
    Value *Rem = Builder.CreateMul(Div, Den);
    Res = Builder.CreateSub(Num, Rem);
  }

  // Truncate to number of bits this divide really is.
  if (IsSigned) {
    Res = Builder.CreateTrunc(Res, Builder.getIntNTy(DivBits));
    Res = Builder.CreateSExt(Res, Ty);
  } else {
    ConstantInt *TruncMask = Builder.getInt32((UINT64_C(1) << DivBits) - 1);
    Res = Builder.CreateAnd(Res, TruncMask);
  }

  return Res;
}
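
// Worked example of the bit bookkeeping above (illustrative): for an
// unsigned divide whose operands were zero-extended from i16,
// ComputeNumSignBits reports at least 16 sign bits for each input, so
// SignBits >= 16 and DivBits = 32 - SignBits <= 16; the result is then
// masked down to its low DivBits bits.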

Value* AMDGPUCodeGenPrepare::expandDivRem32(IRBuilder<> &Builder,
                                            BinaryOperator &I,
                                            Value *Num, Value *Den) const {
  Instruction::BinaryOps Opc = I.getOpcode();
  assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
         Opc == Instruction::SRem || Opc == Instruction::SDiv);

  FastMathFlags FMF;
  FMF.setFast();
  Builder.setFastMathFlags(FMF);

  if (isa<Constant>(Den))
    return nullptr; // Keep it for optimization

  bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
  bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;

  Type *Ty = Num->getType();
  Type *I32Ty = Builder.getInt32Ty();
  Type *F32Ty = Builder.getFloatTy();

  if (Ty->getScalarSizeInBits() < 32) {
    if (IsSigned) {
      Num = Builder.CreateSExt(Num, I32Ty);
      Den = Builder.CreateSExt(Den, I32Ty);
    } else {
      Num = Builder.CreateZExt(Num, I32Ty);
      Den = Builder.CreateZExt(Den, I32Ty);
    }
  }

  if (Value *Res = expandDivRem24(Builder, I, Num, Den, IsDiv, IsSigned)) {
    Res = Builder.CreateTrunc(Res, Ty);
    return Res;
  }

  ConstantInt *Zero = Builder.getInt32(0);
  ConstantInt *One = Builder.getInt32(1);
  ConstantInt *MinusOne = Builder.getInt32(~0);

  Value *Sign = nullptr;
  if (IsSigned) {
    ConstantInt *K31 = Builder.getInt32(31);
    Value *LHSign = Builder.CreateAShr(Num, K31);
    Value *RHSign = Builder.CreateAShr(Den, K31);
    // Remainder sign is the same as LHS
    Sign = IsDiv ? Builder.CreateXor(LHSign, RHSign) : LHSign;

    Num = Builder.CreateAdd(Num, LHSign);
    Den = Builder.CreateAdd(Den, RHSign);

    Num = Builder.CreateXor(Num, LHSign);
    Den = Builder.CreateXor(Den, RHSign);
  }
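
  // Worked example of the reciprocal step below (illustrative): for Den = 3,
  // 2^32 / 3 = 0x55555555.55..., so RCP comes out near 0x55555555 with a
  // small rounding error e; the error term E computed afterwards corrects
  // the quotient for that error.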
  // RCP = URECIP(Den) = 2^32 / Den + e
  // e is rounding error.
  Value *DEN_F32 = Builder.CreateUIToFP(Den, F32Ty);
  Value *RCP_F32 = Builder.CreateFDiv(ConstantFP::get(F32Ty, 1.0), DEN_F32);
  Constant *UINT_MAX_PLUS_1 = ConstantFP::get(F32Ty, BitsToFloat(0x4f800000));
  Value *RCP_SCALE = Builder.CreateFMul(RCP_F32, UINT_MAX_PLUS_1);
  Value *RCP = Builder.CreateFPToUI(RCP_SCALE, I32Ty);

  // RCP_LO, RCP_HI = mul(RCP, Den)
  Value *RCP_LO, *RCP_HI;
  std::tie(RCP_LO, RCP_HI) = getMul64(Builder, RCP, Den);

  // NEG_RCP_LO = -RCP_LO
  Value *NEG_RCP_LO = Builder.CreateNeg(RCP_LO);

  // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
  Value *RCP_HI_0_CC = Builder.CreateICmpEQ(RCP_HI, Zero);
  Value *ABS_RCP_LO = Builder.CreateSelect(RCP_HI_0_CC, NEG_RCP_LO, RCP_LO);

  // Calculate the rounding error from the URECIP instruction
  // E = mulhu(ABS_RCP_LO, RCP)
  Value *E = getMulHu(Builder, ABS_RCP_LO, RCP);

  // RCP_A_E = RCP + E
  Value *RCP_A_E = Builder.CreateAdd(RCP, E);

  // RCP_S_E = RCP - E
  Value *RCP_S_E = Builder.CreateSub(RCP, E);

  // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_S_E)
  Value *Tmp0 = Builder.CreateSelect(RCP_HI_0_CC, RCP_A_E, RCP_S_E);

  // Quotient = mulhu(Tmp0, Num)
  Value *Quotient = getMulHu(Builder, Tmp0, Num);

  // Num_S_Remainder = Quotient * Den
  Value *Num_S_Remainder = Builder.CreateMul(Quotient, Den);

  // Remainder = Num - Num_S_Remainder
  Value *Remainder = Builder.CreateSub(Num, Num_S_Remainder);

  // Remainder_GE_Den = (Remainder >= Den ? -1 : 0)
  Value *Rem_GE_Den_CC = Builder.CreateICmpUGE(Remainder, Den);
  Value *Remainder_GE_Den = Builder.CreateSelect(Rem_GE_Den_CC, MinusOne, Zero);

  // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0)
  Value *Num_GE_Num_S_Rem_CC = Builder.CreateICmpUGE(Num, Num_S_Remainder);
  Value *Remainder_GE_Zero = Builder.CreateSelect(Num_GE_Num_S_Rem_CC,
                                                  MinusOne, Zero);

  // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
  Value *Tmp1 = Builder.CreateAnd(Remainder_GE_Den, Remainder_GE_Zero);
  Value *Tmp1_0_CC = Builder.CreateICmpEQ(Tmp1, Zero);

  Value *Res;
  if (IsDiv) {
    // Quotient_A_One = Quotient + 1
    Value *Quotient_A_One = Builder.CreateAdd(Quotient, One);

    // Quotient_S_One = Quotient - 1
    Value *Quotient_S_One = Builder.CreateSub(Quotient, One);

    // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One)
    Value *Div = Builder.CreateSelect(Tmp1_0_CC, Quotient, Quotient_A_One);

    // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div)
    Res = Builder.CreateSelect(Num_GE_Num_S_Rem_CC, Div, Quotient_S_One);
  } else {
    // Remainder_S_Den = Remainder - Den
    Value *Remainder_S_Den = Builder.CreateSub(Remainder, Den);

    // Remainder_A_Den = Remainder + Den
    Value *Remainder_A_Den = Builder.CreateAdd(Remainder, Den);

    // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den)
    Value *Rem = Builder.CreateSelect(Tmp1_0_CC, Remainder, Remainder_S_Den);

    // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
    Res = Builder.CreateSelect(Num_GE_Num_S_Rem_CC, Rem, Remainder_A_Den);
  }
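
  // Note on the fixup below (illustrative reasoning, not from the original
  // source): the operands were made non-negative before the unsigned
  // expansion, so when Sign == -1 the xor/sub pair performs a
  // two's-complement negate of Res, and when Sign == 0 it is a no-op.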
  if (IsSigned) {
    Res = Builder.CreateXor(Res, Sign);
    Res = Builder.CreateSub(Res, Sign);
  }

  Res = Builder.CreateTrunc(Res, Ty);

  return Res;
}

bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I) && promoteUniformOpToI32(I))
    return true;

  if (replaceMulWithMul24(I))
    return true;

  bool Changed = false;
  Instruction::BinaryOps Opc = I.getOpcode();
  Type *Ty = I.getType();
  Value *NewDiv = nullptr;
  if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
       Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
      Ty->getScalarSizeInBits() <= 32) {
    Value *Num = I.getOperand(0);
    Value *Den = I.getOperand(1);
    IRBuilder<> Builder(&I);
    Builder.SetCurrentDebugLocation(I.getDebugLoc());

    if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
      NewDiv = UndefValue::get(VT);

      for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {
        Value *NumEltN = Builder.CreateExtractElement(Num, N);
        Value *DenEltN = Builder.CreateExtractElement(Den, N);
        Value *NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN);
        if (!NewElt)
          NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
        NewDiv = Builder.CreateInsertElement(NewDiv, NewElt, N);
      }
    } else {
      NewDiv = expandDivRem32(Builder, I, Num, Den);
    }

    if (NewDiv) {
      I.replaceAllUsesWith(NewDiv);
      I.eraseFromParent();
      Changed = true;
    }
  }

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
  if (!WidenLoads)
    return false;

  if ((I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
       I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
      canWidenScalarExtLoad(I)) {
    IRBuilder<> Builder(&I);
    Builder.SetCurrentDebugLocation(I.getDebugLoc());

    Type *I32Ty = Builder.getInt32Ty();
    Type *PT = PointerType::get(I32Ty, I.getPointerAddressSpace());
    Value *BitCast = Builder.CreateBitCast(I.getPointerOperand(), PT);
    LoadInst *WidenLoad = Builder.CreateLoad(I32Ty, BitCast);
    WidenLoad->copyMetadata(I);

    // If we have range metadata, we need to convert the type, and not make
    // assumptions about the high bits.
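    // Illustrative example with hypothetical values: an i8 load annotated
    // !range !{i8 1, i8 100} becomes an i32 load annotated
    // !range !{i32 1, i32 0}, i.e. only the non-zero lower bound survives
    // and the high bits stay unconstrained.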
    if (auto *Range = WidenLoad->getMetadata(LLVMContext::MD_range)) {
      ConstantInt *Lower =
        mdconst::extract<ConstantInt>(Range->getOperand(0));

      if (Lower->getValue().isNullValue()) {
        WidenLoad->setMetadata(LLVMContext::MD_range, nullptr);
      } else {
        Metadata *LowAndHigh[] = {
          ConstantAsMetadata::get(ConstantInt::get(I32Ty, Lower->getValue().zext(32))),
          // Don't make assumptions about the high bits.
          ConstantAsMetadata::get(ConstantInt::get(I32Ty, 0))
        };

        WidenLoad->setMetadata(LLVMContext::MD_range,
                               MDNode::get(Mod->getContext(), LowAndHigh));
      }
    }

    int TySize = Mod->getDataLayout().getTypeSizeInBits(I.getType());
    Type *IntNTy = Builder.getIntNTy(TySize);
    Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy);
    Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType());
    I.replaceAllUsesWith(ValOrig);
    I.eraseFromParent();
    return true;
  }

  return false;
}

bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
  switch (I.getIntrinsicID()) {
  case Intrinsic::bitreverse:
    return visitBitreverseIntrinsicInst(I);
  default:
    return false;
  }
}

bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformBitreverseToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  DL = &Mod->getDataLayout();
  return false;
}

bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
  ST = &TM.getSubtarget<GCNSubtarget>(F);
  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  DA = &getAnalysis<LegacyDivergenceAnalysis>();
  HasUnsafeFPMath = hasUnsafeFPMath(F);

  bool MadeChange = false;

  for (BasicBlock &BB : F) {
    BasicBlock::iterator Next;
    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) {
      Next = std::next(I);
      MadeChange |= visit(*I);
    }
  }

  return MadeChange;
}

INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
                    false, false)

char AMDGPUCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
  return new AMDGPUCodeGenPrepare();
}