//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include <cassert>
#include <iterator>

#define DEBUG_TYPE "amdgpu-codegenprepare"

using namespace llvm;

namespace {

static cl::opt<bool> WidenLoads(
  "amdgpu-codegenprepare-widen-constant-loads",
  cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
  cl::ReallyHidden,
  cl::init(true));

class AMDGPUCodeGenPrepare : public FunctionPass,
                             public InstVisitor<AMDGPUCodeGenPrepare, bool> {
  const GCNSubtarget *ST = nullptr;
  AssumptionCache *AC = nullptr;
  LegacyDivergenceAnalysis *DA = nullptr;
  Module *Mod = nullptr;
  const DataLayout *DL = nullptr;
  bool HasUnsafeFPMath = false;

  /// \returns \p T's base element bit width.
  unsigned getBaseElementBitWidth(const Type *T) const;

  /// \returns Equivalent 32 bit integer type for given type \p T. For example,
  /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
  /// is returned.
  Type *getI32Ty(IRBuilder<> &B, const Type *T) const;

  /// \returns True if binary operation \p I is a signed binary operation, false
  /// otherwise.
  bool isSigned(const BinaryOperator &I) const;

  /// \returns True if the condition of 'select' operation \p I comes from a
  /// signed 'icmp' operation, false otherwise.
  bool isSigned(const SelectInst &I) const;

  /// \returns True if type \p T needs to be promoted to 32 bit integer type,
  /// false otherwise.
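  ///
  /// For example, i16 needs promotion, and so does <2 x i8> when the
  /// subtarget lacks packed (VOP3P) instructions; i1 and i32 do not.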
  bool needsPromotionToI32(const Type *T) const;

  /// Promotes uniform binary operation \p I to equivalent 32 bit binary
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, replacing \p I with the equivalent 32 bit binary operation,
  /// and truncating the result of the 32 bit binary operation back to \p I's
  /// original type. Division operations are not promoted.
  ///
  /// \returns True if \p I is promoted to an equivalent 32 bit binary
  /// operation, false otherwise.
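  ///
  /// A sketch of the rewrite for a uniform i16 add (value names are
  /// illustrative):
  ///   %r = add i16 %a, %b
  /// becomes
  ///   %a.ext = zext i16 %a to i32
  ///   %b.ext = zext i16 %b to i32
  ///   %r.ext = add nuw nsw i32 %a.ext, %b.ext
  ///   %r     = trunc i32 %r.ext to i16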
  bool promoteUniformOpToI32(BinaryOperator &I) const;

  /// Promotes uniform 'icmp' operation \p I to a 32 bit 'icmp' operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, and replacing \p I with a 32 bit 'icmp' operation.
  ///
  /// \returns True.
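  ///
  /// For example, a uniform 'icmp slt' on i16 operands becomes (value names
  /// are illustrative):
  ///   %a.ext = sext i16 %a to i32
  ///   %b.ext = sext i16 %b to i32
  ///   %c     = icmp slt i32 %a.ext, %b.ext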
  bool promoteUniformOpToI32(ICmpInst &I) const;

  /// Promotes uniform 'select' operation \p I to a 32 bit 'select'
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, replacing \p I with a 32 bit 'select' operation, and
  /// truncating the result of the 32 bit 'select' operation back to \p I's
  /// original type.
  ///
  /// \returns True.
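  ///
  /// For example, an i16 select whose condition is not a signed 'icmp'
  /// becomes (value names are illustrative):
  ///   %t.ext = zext i16 %t to i32
  ///   %f.ext = zext i16 %f to i32
  ///   %s.ext = select i1 %c, i32 %t.ext, i32 %f.ext
  ///   %s     = trunc i32 %s.ext to i16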
  bool promoteUniformOpToI32(SelectInst &I) const;

  /// Promotes uniform 'bitreverse' intrinsic \p I to the 32 bit 'bitreverse'
  /// intrinsic.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by zero extending the operand to
  /// 32 bits, replacing \p I with the 32 bit 'bitreverse' intrinsic, shifting
  /// the result of the 32 bit 'bitreverse' intrinsic to the right with zero
  /// fill (the shift amount is 32 minus \p I's base element bit width), and
  /// truncating the result of the shift operation back to \p I's original
  /// type.
  ///
  /// \returns True.
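  ///
  /// For example, for i16 (value names are illustrative):
  ///   %r = call i16 @llvm.bitreverse.i16(i16 %a)
  /// becomes
  ///   %a.ext = zext i16 %a to i32
  ///   %r.ext = call i32 @llvm.bitreverse.i32(i32 %a.ext)
  ///   %shr   = lshr i32 %r.ext, 16
  ///   %r     = trunc i32 %shr to i16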
  bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;

  unsigned numBitsUnsigned(Value *Op, unsigned ScalarSize) const;
  unsigned numBitsSigned(Value *Op, unsigned ScalarSize) const;
  bool isI24(Value *V, unsigned ScalarSize) const;
  bool isU24(Value *V, unsigned ScalarSize) const;

  /// Replace mul instructions with llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.i24.
  /// SelectionDAG has an issue where an 'and' asserting that the bits are
  /// known can be lost, so do this matching on the IR instead.
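  ///
  /// A sketch, assuming divergent i32 operands known to fit in 24 bits
  /// unsigned (value names are illustrative):
  ///   %r = mul i32 %a, %b
  /// becomes
  ///   %r = call i32 @llvm.amdgcn.mul.u24(i32 %a, i32 %b)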
  bool replaceMulWithMul24(BinaryOperator &I) const;

  /// Expands 24 bit div or rem.
  Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I,
                        Value *Num, Value *Den,
                        bool IsDiv, bool IsSigned) const;

  /// Expands 32 bit div or rem.
  Value* expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I,
                        Value *Num, Value *Den) const;

  /// Widen a scalar load.
  ///
  /// \details Widen a uniform, small-type load from constant memory to a full
  /// 32 bits and then truncate the result, so that a scalar load can be used
  /// instead of a vector load. The rewrite itself is done in visitLoadInst.
  ///
  /// \returns True if load \p I qualifies for this widening.
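  ///
  /// For example, a qualifying load such as
  ///   %v = load i8, i8 addrspace(4)* %p, align 4
  /// (addrspace(4) is the amdgcn constant address space) is rewritten to
  /// (value names are illustrative):
  ///   %p.cast = bitcast i8 addrspace(4)* %p to i32 addrspace(4)*
  ///   %v.wide = load i32, i32 addrspace(4)* %p.cast
  ///   %v      = trunc i32 %v.wide to i8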
  bool canWidenScalarExtLoad(LoadInst &I) const;

public:
  static char ID;

  AMDGPUCodeGenPrepare() : FunctionPass(ID) {}

  bool visitFDiv(BinaryOperator &I);

  bool visitInstruction(Instruction &I) { return false; }
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitLoadInst(LoadInst &I);
  bool visitICmpInst(ICmpInst &I);
  bool visitSelectInst(SelectInst &I);

  bool visitIntrinsicInst(IntrinsicInst &I);
  bool visitBitreverseIntrinsicInst(IntrinsicInst &I);

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override { return "AMDGPU IR optimizations"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<LegacyDivergenceAnalysis>();
    AU.setPreservesAll();
  }
};

} // end anonymous namespace

unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return T->getIntegerBitWidth();
  return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
}

Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return B.getInt32Ty();
  return VectorType::get(B.getInt32Ty(), cast<VectorType>(T)->getNumElements());
}

bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
  return I.getOpcode() == Instruction::AShr ||
      I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;
}

bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
  return isa<ICmpInst>(I.getOperand(0)) ?
      cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
}

bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const {
  const IntegerType *IntTy = dyn_cast<IntegerType>(T);
  if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
    return true;

  if (const VectorType *VT = dyn_cast<VectorType>(T)) {
    // TODO: The set of packed operations is more limited, so may want to
    // promote some anyway.
    if (ST->hasVOP3PInsts())
      return false;

    return needsPromotionToI32(VT->getElementType());
  }

  return false;
}

// Return true if the op promoted to i32 should have nsw set.
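// For example, the i16 operands of a promoted add are zero-extended, so each
// is at most 0xFFFF and the i32 sum is at most 0x1FFFE, which overflows
// neither the signed nor the unsigned 32-bit range.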
static bool promotedOpIsNSW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Sub:
    return true;
  case Instruction::Mul:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}

// Return true if the op promoted to i32 should have nuw set.
static bool promotedOpIsNUW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Mul:
    return true;
  case Instruction::Sub:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}

bool AMDGPUCodeGenPrepare::canWidenScalarExtLoad(LoadInst &I) const {
  Type *Ty = I.getType();
  const DataLayout &DL = Mod->getDataLayout();
  int TySize = DL.getTypeSizeInBits(Ty);
  unsigned Align = I.getAlignment() ?
                   I.getAlignment() : DL.getABITypeAlignment(Ty);

  return I.isSimple() && TySize < 32 && Align >= 4 && DA->isUniform(&I);
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  if (I.getOpcode() == Instruction::SDiv ||
      I.getOpcode() == Instruction::UDiv ||
      I.getOpcode() == Instruction::SRem ||
      I.getOpcode() == Instruction::URem)
    return false;

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }

  ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
  if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
    if (promotedOpIsNSW(I))
      Inst->setHasNoSignedWrap();

    if (promotedOpIsNUW(I))
      Inst->setHasNoUnsignedWrap();

    if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
      Inst->setIsExact(ExactOp->isExact());
  }

  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(ICmpInst &I) const {
  assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *NewICmp = nullptr;

  if (I.isSigned()) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }
  NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);

  I.replaceAllUsesWith(NewICmp);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(SelectInst &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp1 = nullptr;
  Value *ExtOp2 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
  } else {
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
  }
  ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(
    IntrinsicInst &I) const {
  assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
         "I must be bitreverse intrinsic");
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Function *I32 =
      Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty });
  Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
  Value *ExtRes = Builder.CreateCall(I32, { ExtOp });
  Value *LShrOp =
      Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
  Value *TruncRes =
      Builder.CreateTrunc(LShrOp, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

unsigned AMDGPUCodeGenPrepare::numBitsUnsigned(Value *Op,
                                               unsigned ScalarSize) const {
  KnownBits Known = computeKnownBits(Op, *DL, 0, AC);
  return ScalarSize - Known.countMinLeadingZeros();
}

unsigned AMDGPUCodeGenPrepare::numBitsSigned(Value *Op,
                                             unsigned ScalarSize) const {
  // In order for this to be a signed 24-bit value, bit 23 must be a sign bit.
  return ScalarSize - ComputeNumSignBits(Op, *DL, 0, AC);
}

bool AMDGPUCodeGenPrepare::isI24(Value *V, unsigned ScalarSize) const {
  // Types less than 24-bit should be treated as unsigned 24-bit values.
  return ScalarSize >= 24 && numBitsSigned(V, ScalarSize) < 24;
}

bool AMDGPUCodeGenPrepare::isU24(Value *V, unsigned ScalarSize) const {
  return numBitsUnsigned(V, ScalarSize) <= 24;
}

static void extractValues(IRBuilder<> &Builder,
                          SmallVectorImpl<Value *> &Values, Value *V) {
  VectorType *VT = dyn_cast<VectorType>(V->getType());
  if (!VT) {
    Values.push_back(V);
    return;
  }

  for (int I = 0, E = VT->getNumElements(); I != E; ++I)
    Values.push_back(Builder.CreateExtractElement(V, I));
}

static Value *insertValues(IRBuilder<> &Builder,
                           Type *Ty,
                           SmallVectorImpl<Value *> &Values) {
  if (Values.size() == 1)
    return Values[0];

  Value *NewVal = UndefValue::get(Ty);
  for (int I = 0, E = Values.size(); I != E; ++I)
    NewVal = Builder.CreateInsertElement(NewVal, Values[I], I);

  return NewVal;
}

bool AMDGPUCodeGenPrepare::replaceMulWithMul24(BinaryOperator &I) const {
  if (I.getOpcode() != Instruction::Mul)
    return false;

  Type *Ty = I.getType();
  unsigned Size = Ty->getScalarSizeInBits();
  if (Size <= 16 && ST->has16BitInsts())
    return false;

  // Prefer scalar if this could be s_mul_i32
  if (DA->isUniform(&I))
    return false;

  Value *LHS = I.getOperand(0);
  Value *RHS = I.getOperand(1);
  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Intrinsic::ID IntrID = Intrinsic::not_intrinsic;

  // TODO: Should this try to match mulhi24?
  if (ST->hasMulU24() && isU24(LHS, Size) && isU24(RHS, Size)) {
    IntrID = Intrinsic::amdgcn_mul_u24;
  } else if (ST->hasMulI24() && isI24(LHS, Size) && isI24(RHS, Size)) {
    IntrID = Intrinsic::amdgcn_mul_i24;
  } else
    return false;

  SmallVector<Value *, 4> LHSVals;
  SmallVector<Value *, 4> RHSVals;
  SmallVector<Value *, 4> ResultVals;
  extractValues(Builder, LHSVals, LHS);
  extractValues(Builder, RHSVals, RHS);

  IntegerType *I32Ty = Builder.getInt32Ty();
  FunctionCallee Intrin = Intrinsic::getDeclaration(Mod, IntrID);
  for (int I = 0, E = LHSVals.size(); I != E; ++I) {
    Value *LHS, *RHS;
    if (IntrID == Intrinsic::amdgcn_mul_u24) {
      LHS = Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
      RHS = Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);
    } else {
      LHS = Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty);
      RHS = Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty);
    }

    Value *Result = Builder.CreateCall(Intrin, {LHS, RHS});

    if (IntrID == Intrinsic::amdgcn_mul_u24) {
      ResultVals.push_back(Builder.CreateZExtOrTrunc(Result,
                                                     LHSVals[I]->getType()));
    } else {
      ResultVals.push_back(Builder.CreateSExtOrTrunc(Result,
                                                     LHSVals[I]->getType()));
    }
  }

  I.replaceAllUsesWith(insertValues(Builder, Ty, ResultVals));
  I.eraseFromParent();

  return true;
}

static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv, bool HasDenormals) {
  const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
  if (!CNum)
    return HasDenormals;

  if (UnsafeDiv)
    return true;

  bool IsOne = CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0);

  // Reciprocal f32 is handled separately without denormals.
  return HasDenormals ^ IsOne;
}

// Insert an intrinsic for fast fdiv for safe math situations where we can
// reduce precision. Leave fdiv for situations where the generic node is
// expected to be optimized.
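//
// For example, given at least 2.5 ULP of allowed error and FP32 denormals
// disabled, a sketch of the rewrite (value names are illustrative):
//   %d = fdiv float %x, %y, !fpmath !0    ; !0 = !{float 2.500000e+00}
// becomes
//   %d = call float @llvm.amdgcn.fdiv.fast(float %x, float %y)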
bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
  Type *Ty = FDiv.getType();

  if (!Ty->getScalarType()->isFloatTy())
    return false;

  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
  if (!FPMath)
    return false;

  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
  float ULP = FPOp->getFPAccuracy();
  if (ULP < 2.5f)
    return false;

  FastMathFlags FMF = FPOp->getFastMathFlags();
  bool UnsafeDiv = HasUnsafeFPMath || FMF.isFast() || FMF.allowReciprocal();

  // With UnsafeDiv, the node will be optimized to just rcp and mul.
  if (UnsafeDiv)
    return false;

  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
  Builder.setFastMathFlags(FMF);
  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

  Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);

  Value *Num = FDiv.getOperand(0);
  Value *Den = FDiv.getOperand(1);

  Value *NewFDiv = nullptr;

  bool HasDenormals = ST->hasFP32Denormals();
  if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
    NewFDiv = UndefValue::get(VT);

    // FIXME: Doesn't do the right thing for cases where the vector is partially
    // constant. This works when the scalarizer pass is run first.
    for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
      Value *NumEltI = Builder.CreateExtractElement(Num, I);
      Value *DenEltI = Builder.CreateExtractElement(Den, I);
      Value *NewElt;

      if (shouldKeepFDivF32(NumEltI, UnsafeDiv, HasDenormals)) {
        NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
      } else {
        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
      }

      NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
    }
  } else {
    if (!shouldKeepFDivF32(Num, UnsafeDiv, HasDenormals))
      NewFDiv = Builder.CreateCall(Decl, { Num, Den });
  }

  if (NewFDiv) {
    FDiv.replaceAllUsesWith(NewFDiv);
    NewFDiv->takeName(&FDiv);
    FDiv.eraseFromParent();
  }

  return !!NewFDiv;
}

static bool hasUnsafeFPMath(const Function &F) {
  Attribute Attr = F.getFnAttribute("unsafe-fp-math");
  return Attr.getValueAsString() == "true";
}

static std::pair<Value*, Value*> getMul64(IRBuilder<> &Builder,
                                          Value *LHS, Value *RHS) {
  Type *I32Ty = Builder.getInt32Ty();
  Type *I64Ty = Builder.getInt64Ty();

  Value *LHS_EXT64 = Builder.CreateZExt(LHS, I64Ty);
  Value *RHS_EXT64 = Builder.CreateZExt(RHS, I64Ty);
  Value *MUL64 = Builder.CreateMul(LHS_EXT64, RHS_EXT64);
  Value *Lo = Builder.CreateTrunc(MUL64, I32Ty);
  Value *Hi = Builder.CreateLShr(MUL64, Builder.getInt64(32));
  Hi = Builder.CreateTrunc(Hi, I32Ty);
  return std::make_pair(Lo, Hi);
}

static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) {
  return getMul64(Builder, LHS, RHS).second;
}
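
// For example, getMul64 on i32 inputs emits (value names are illustrative):
//   %lhs.ext = zext i32 %a to i64
//   %rhs.ext = zext i32 %b to i64
//   %mul64   = mul i64 %lhs.ext, %rhs.ext
//   %lo      = trunc i64 %mul64 to i32
//   %hi.64   = lshr i64 %mul64, 32
//   %hi      = trunc i64 %hi.64 to i32
// getMulHu returns just %hi, the high half of the 64-bit product.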

// The fractional part of a float is enough to accurately represent up to
// a 24-bit signed integer.
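// (An IEEE-754 float has a 24-bit significand: 23 stored bits plus an implicit
// leading one, so integers of magnitude up to 2^24 are exactly representable.)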
Value* AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder,
                                            BinaryOperator &I,
                                            Value *Num, Value *Den,
                                            bool IsDiv, bool IsSigned) const {
  assert(Num->getType()->isIntegerTy(32));

  const DataLayout &DL = Mod->getDataLayout();
  unsigned LHSSignBits = ComputeNumSignBits(Num, DL, 0, AC, &I);
  if (LHSSignBits < 9)
    return nullptr;

  unsigned RHSSignBits = ComputeNumSignBits(Den, DL, 0, AC, &I);
  if (RHSSignBits < 9)
    return nullptr;

  unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
  unsigned DivBits = 32 - SignBits;
  if (IsSigned)
    ++DivBits;

  Type *Ty = Num->getType();
  Type *I32Ty = Builder.getInt32Ty();
  Type *F32Ty = Builder.getFloatTy();
  ConstantInt *One = Builder.getInt32(1);
  Value *JQ = One;

  if (IsSigned) {
    // char|short jq = ia ^ ib;
    JQ = Builder.CreateXor(Num, Den);

    // jq = jq >> (bitsize - 2)
    JQ = Builder.CreateAShr(JQ, Builder.getInt32(30));

    // jq = jq | 0x1
    JQ = Builder.CreateOr(JQ, One);
  }

  // int ia = (int)LHS;
  Value *IA = Num;

  // int ib = (int)RHS;
  Value *IB = Den;

  // float fa = (float)ia;
  Value *FA = IsSigned ? Builder.CreateSIToFP(IA, F32Ty)
                       : Builder.CreateUIToFP(IA, F32Ty);

  // float fb = (float)ib;
  Value *FB = IsSigned ? Builder.CreateSIToFP(IB, F32Ty)
                       : Builder.CreateUIToFP(IB, F32Ty);

  Value *RCP = Builder.CreateFDiv(ConstantFP::get(F32Ty, 1.0), FB);
  Value *FQM = Builder.CreateFMul(FA, RCP);

  // fq = trunc(fqm);
  CallInst *FQ = Builder.CreateUnaryIntrinsic(Intrinsic::trunc, FQM);
  FQ->copyFastMathFlags(Builder.getFastMathFlags());

  // float fqneg = -fq;
  Value *FQNeg = Builder.CreateFNeg(FQ);

  // float fr = mad(fqneg, fb, fa);
  Value *FR = Builder.CreateIntrinsic(Intrinsic::amdgcn_fmad_ftz,
                                      {FQNeg->getType()}, {FQNeg, FB, FA}, FQ);

  // int iq = (int)fq;
  Value *IQ = IsSigned ? Builder.CreateFPToSI(FQ, I32Ty)
                       : Builder.CreateFPToUI(FQ, I32Ty);

  // fr = fabs(fr);
  FR = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FR, FQ);

  // fb = fabs(fb);
  FB = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FB, FQ);

  // int cv = fr >= fb;
  Value *CV = Builder.CreateFCmpOGE(FR, FB);

  // jq = (cv ? jq : 0);
  JQ = Builder.CreateSelect(CV, JQ, Builder.getInt32(0));

  // dst = iq + jq;
  Value *Div = Builder.CreateAdd(IQ, JQ);

  Value *Res = Div;
  if (!IsDiv) {
    // Rem needs compensation, it's easier to recompute it
    Value *Rem = Builder.CreateMul(Div, Den);
    Res = Builder.CreateSub(Num, Rem);
  }

  // Truncate to number of bits this divide really is.
  if (IsSigned) {
    Res = Builder.CreateTrunc(Res, Builder.getIntNTy(DivBits));
    Res = Builder.CreateSExt(Res, Ty);
  } else {
    ConstantInt *TruncMask = Builder.getInt32((UINT64_C(1) << DivBits) - 1);
    Res = Builder.CreateAnd(Res, TruncMask);
  }

  return Res;
}

Value* AMDGPUCodeGenPrepare::expandDivRem32(IRBuilder<> &Builder,
                                            BinaryOperator &I,
                                            Value *Num, Value *Den) const {
  Instruction::BinaryOps Opc = I.getOpcode();
  assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
         Opc == Instruction::SRem || Opc == Instruction::SDiv);

  FastMathFlags FMF;
  FMF.setFast();
  Builder.setFastMathFlags(FMF);

  if (isa<Constant>(Den))
    return nullptr; // Keep it for optimization

  bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
  bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;

  Type *Ty = Num->getType();
  Type *I32Ty = Builder.getInt32Ty();
  Type *F32Ty = Builder.getFloatTy();

  if (Ty->getScalarSizeInBits() < 32) {
    if (IsSigned) {
      Num = Builder.CreateSExt(Num, I32Ty);
      Den = Builder.CreateSExt(Den, I32Ty);
    } else {
      Num = Builder.CreateZExt(Num, I32Ty);
      Den = Builder.CreateZExt(Den, I32Ty);
    }
  }

  if (Value *Res = expandDivRem24(Builder, I, Num, Den, IsDiv, IsSigned)) {
    Res = Builder.CreateTrunc(Res, Ty);
    return Res;
  }

  ConstantInt *Zero = Builder.getInt32(0);
  ConstantInt *One = Builder.getInt32(1);
  ConstantInt *MinusOne = Builder.getInt32(~0);

  Value *Sign = nullptr;
  if (IsSigned) {
    ConstantInt *K31 = Builder.getInt32(31);
    Value *LHSign = Builder.CreateAShr(Num, K31);
    Value *RHSign = Builder.CreateAShr(Den, K31);
    // Remainder sign is the same as LHS
    Sign = IsDiv ? Builder.CreateXor(LHSign, RHSign) : LHSign;

    Num = Builder.CreateAdd(Num, LHSign);
    Den = Builder.CreateAdd(Den, RHSign);

    Num = Builder.CreateXor(Num, LHSign);
    Den = Builder.CreateXor(Den, RHSign);
  }

  // RCP = URECIP(Den) = 2^32 / Den + e
  // where e is the rounding error.
  Value *DEN_F32 = Builder.CreateUIToFP(Den, F32Ty);
  Value *RCP_F32 = Builder.CreateFDiv(ConstantFP::get(F32Ty, 1.0), DEN_F32);
  Constant *UINT_MAX_PLUS_1 = ConstantFP::get(F32Ty, BitsToFloat(0x4f800000));
  Value *RCP_SCALE = Builder.CreateFMul(RCP_F32, UINT_MAX_PLUS_1);
  Value *RCP = Builder.CreateFPToUI(RCP_SCALE, I32Ty);

  // RCP_LO, RCP_HI = mul(RCP, Den)
  Value *RCP_LO, *RCP_HI;
  std::tie(RCP_LO, RCP_HI) = getMul64(Builder, RCP, Den);

  // NEG_RCP_LO = -RCP_LO
  Value *NEG_RCP_LO = Builder.CreateNeg(RCP_LO);

  // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
  Value *RCP_HI_0_CC = Builder.CreateICmpEQ(RCP_HI, Zero);
  Value *ABS_RCP_LO = Builder.CreateSelect(RCP_HI_0_CC, NEG_RCP_LO, RCP_LO);

  // Calculate the rounding error from the URECIP instruction
  // E = mulhu(ABS_RCP_LO, RCP)
  Value *E = getMulHu(Builder, ABS_RCP_LO, RCP);

  // RCP_A_E = RCP + E
  Value *RCP_A_E = Builder.CreateAdd(RCP, E);

  // RCP_S_E = RCP - E
  Value *RCP_S_E = Builder.CreateSub(RCP, E);

  // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_S_E)
  Value *Tmp0 = Builder.CreateSelect(RCP_HI_0_CC, RCP_A_E, RCP_S_E);

  // Quotient = mulhu(Tmp0, Num)
  Value *Quotient = getMulHu(Builder, Tmp0, Num);

  // Num_S_Remainder = Quotient * Den
  Value *Num_S_Remainder = Builder.CreateMul(Quotient, Den);

  // Remainder = Num - Num_S_Remainder
  Value *Remainder = Builder.CreateSub(Num, Num_S_Remainder);

  // Remainder_GE_Den = (Remainder >= Den ? -1 : 0)
  Value *Rem_GE_Den_CC = Builder.CreateICmpUGE(Remainder, Den);
  Value *Remainder_GE_Den = Builder.CreateSelect(Rem_GE_Den_CC, MinusOne, Zero);

  // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0)
  Value *Num_GE_Num_S_Rem_CC = Builder.CreateICmpUGE(Num, Num_S_Remainder);
  Value *Remainder_GE_Zero = Builder.CreateSelect(Num_GE_Num_S_Rem_CC,
                                                  MinusOne, Zero);

  // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
  Value *Tmp1 = Builder.CreateAnd(Remainder_GE_Den, Remainder_GE_Zero);
  Value *Tmp1_0_CC = Builder.CreateICmpEQ(Tmp1, Zero);

  Value *Res;
  if (IsDiv) {
    // Quotient_A_One = Quotient + 1
    Value *Quotient_A_One = Builder.CreateAdd(Quotient, One);

    // Quotient_S_One = Quotient - 1
    Value *Quotient_S_One = Builder.CreateSub(Quotient, One);

    // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One)
    Value *Div = Builder.CreateSelect(Tmp1_0_CC, Quotient, Quotient_A_One);

    // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div)
    Res = Builder.CreateSelect(Num_GE_Num_S_Rem_CC, Div, Quotient_S_One);
  } else {
    // Remainder_S_Den = Remainder - Den
    Value *Remainder_S_Den = Builder.CreateSub(Remainder, Den);

    // Remainder_A_Den = Remainder + Den
    Value *Remainder_A_Den = Builder.CreateAdd(Remainder, Den);

    // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den)
    Value *Rem = Builder.CreateSelect(Tmp1_0_CC, Remainder, Remainder_S_Den);

    // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
    Res = Builder.CreateSelect(Num_GE_Num_S_Rem_CC, Rem, Remainder_A_Den);
  }

  if (IsSigned) {
    Res = Builder.CreateXor(Res, Sign);
    Res = Builder.CreateSub(Res, Sign);
  }

  Res = Builder.CreateTrunc(Res, Ty);

  return Res;
}

bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I) && promoteUniformOpToI32(I))
    return true;

  if (replaceMulWithMul24(I))
    return true;

  bool Changed = false;
  Instruction::BinaryOps Opc = I.getOpcode();
  Type *Ty = I.getType();
  Value *NewDiv = nullptr;
  if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
       Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
      Ty->getScalarSizeInBits() <= 32) {
    Value *Num = I.getOperand(0);
    Value *Den = I.getOperand(1);
    IRBuilder<> Builder(&I);
    Builder.SetCurrentDebugLocation(I.getDebugLoc());

    if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
      NewDiv = UndefValue::get(VT);

      for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {
        Value *NumEltN = Builder.CreateExtractElement(Num, N);
        Value *DenEltN = Builder.CreateExtractElement(Den, N);
        Value *NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN);
        if (!NewElt)
          NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
        NewDiv = Builder.CreateInsertElement(NewDiv, NewElt, N);
      }
    } else {
      NewDiv = expandDivRem32(Builder, I, Num, Den);
    }

    if (NewDiv) {
      I.replaceAllUsesWith(NewDiv);
      I.eraseFromParent();
      Changed = true;
    }
  }

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
  if (!WidenLoads)
    return false;

  if ((I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
       I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
      canWidenScalarExtLoad(I)) {
    IRBuilder<> Builder(&I);
    Builder.SetCurrentDebugLocation(I.getDebugLoc());

    Type *I32Ty = Builder.getInt32Ty();
    Type *PT = PointerType::get(I32Ty, I.getPointerAddressSpace());
    Value *BitCast = Builder.CreateBitCast(I.getPointerOperand(), PT);
    LoadInst *WidenLoad = Builder.CreateLoad(I32Ty, BitCast);
    WidenLoad->copyMetadata(I);

    // If we have range metadata, we need to convert the type, and not make
    // assumptions about the high bits.
    if (auto *Range = WidenLoad->getMetadata(LLVMContext::MD_range)) {
      ConstantInt *Lower =
        mdconst::extract<ConstantInt>(Range->getOperand(0));

      if (Lower->getValue().isNullValue()) {
        WidenLoad->setMetadata(LLVMContext::MD_range, nullptr);
      } else {
        Metadata *LowAndHigh[] = {
          ConstantAsMetadata::get(ConstantInt::get(I32Ty, Lower->getValue().zext(32))),
          // Don't make assumptions about the high bits.
          ConstantAsMetadata::get(ConstantInt::get(I32Ty, 0))
        };

        WidenLoad->setMetadata(LLVMContext::MD_range,
                               MDNode::get(Mod->getContext(), LowAndHigh));
      }
    }

    int TySize = Mod->getDataLayout().getTypeSizeInBits(I.getType());
    Type *IntNTy = Builder.getIntNTy(TySize);
    Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy);
    Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType());
    I.replaceAllUsesWith(ValOrig);
    I.eraseFromParent();
    return true;
  }

  return false;
}

bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
  switch (I.getIntrinsicID()) {
  case Intrinsic::bitreverse:
    return visitBitreverseIntrinsicInst(I);
  default:
    return false;
  }
}

bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformBitreverseToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  DL = &Mod->getDataLayout();
  return false;
}

bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
  ST = &TM.getSubtarget<GCNSubtarget>(F);
  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  DA = &getAnalysis<LegacyDivergenceAnalysis>();
  HasUnsafeFPMath = hasUnsafeFPMath(F);

  bool MadeChange = false;

  for (BasicBlock &BB : F) {
    BasicBlock::iterator Next;
    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) {
      Next = std::next(I);
      MadeChange |= visit(*I);
    }
  }

  return MadeChange;
}

INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
                    false, false)

char AMDGPUCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
  return new AMDGPUCodeGenPrepare();
}