1 //===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This pass does misc. AMDGPU optimizations on IR before instruction
11 /// selection.
12 //
13 //===----------------------------------------------------------------------===//
14
15 #include "AMDGPU.h"
16 #include "AMDGPUTargetMachine.h"
17 #include "SIModeRegisterDefaults.h"
18 #include "llvm/Analysis/AssumptionCache.h"
19 #include "llvm/Analysis/ConstantFolding.h"
20 #include "llvm/Analysis/TargetLibraryInfo.h"
21 #include "llvm/Analysis/UniformityAnalysis.h"
22 #include "llvm/Analysis/ValueTracking.h"
23 #include "llvm/CodeGen/TargetPassConfig.h"
24 #include "llvm/IR/Dominators.h"
25 #include "llvm/IR/IRBuilder.h"
26 #include "llvm/IR/InstVisitor.h"
27 #include "llvm/IR/IntrinsicsAMDGPU.h"
28 #include "llvm/IR/PatternMatch.h"
29 #include "llvm/InitializePasses.h"
30 #include "llvm/Pass.h"
31 #include "llvm/Support/KnownBits.h"
32 #include "llvm/Transforms/Utils/IntegerDivision.h"
33 #include "llvm/Transforms/Utils/Local.h"
34
35 #define DEBUG_TYPE "amdgpu-codegenprepare"
36
37 using namespace llvm;
38 using namespace llvm::PatternMatch;
39
40 namespace {
41
42 static cl::opt<bool> WidenLoads(
43 "amdgpu-codegenprepare-widen-constant-loads",
44 cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
45 cl::ReallyHidden,
46 cl::init(false));
47
48 static cl::opt<bool> Widen16BitOps(
49 "amdgpu-codegenprepare-widen-16-bit-ops",
50 cl::desc("Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"),
51 cl::ReallyHidden,
52 cl::init(true));
53
54 static cl::opt<bool>
55 BreakLargePHIs("amdgpu-codegenprepare-break-large-phis",
56 cl::desc("Break large PHI nodes for DAGISel"),
57 cl::ReallyHidden, cl::init(true));
58
59 static cl::opt<bool>
60 ForceBreakLargePHIs("amdgpu-codegenprepare-force-break-large-phis",
61 cl::desc("For testing purposes, always break large "
62 "PHIs even if it isn't profitable."),
63 cl::ReallyHidden, cl::init(false));
64
65 static cl::opt<unsigned> BreakLargePHIsThreshold(
66 "amdgpu-codegenprepare-break-large-phis-threshold",
67 cl::desc("Minimum type size in bits for breaking large PHI nodes"),
68 cl::ReallyHidden, cl::init(32));
69
70 static cl::opt<bool> UseMul24Intrin(
71 "amdgpu-codegenprepare-mul24",
72 cl::desc("Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
73 cl::ReallyHidden,
74 cl::init(true));
75
76 // Legalize 64-bit division by using the generic IR expansion.
77 static cl::opt<bool> ExpandDiv64InIR(
78 "amdgpu-codegenprepare-expand-div64",
79 cl::desc("Expand 64-bit division in AMDGPUCodeGenPrepare"),
80 cl::ReallyHidden,
81 cl::init(false));
82
83 // Leave all division operations as they are. This supersedes ExpandDiv64InIR
84 // and is used for testing the legalizer.
85 static cl::opt<bool> DisableIDivExpand(
86 "amdgpu-codegenprepare-disable-idiv-expansion",
87 cl::desc("Prevent expanding integer division in AMDGPUCodeGenPrepare"),
88 cl::ReallyHidden,
89 cl::init(false));
90
91 // Disable processing of fdiv so we can better test the backend implementations.
92 static cl::opt<bool> DisableFDivExpand(
93 "amdgpu-codegenprepare-disable-fdiv-expansion",
94 cl::desc("Prevent expanding floating point division in AMDGPUCodeGenPrepare"),
95 cl::ReallyHidden,
96 cl::init(false));
97
98 class AMDGPUCodeGenPrepareImpl
99 : public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
100 public:
101 const GCNSubtarget *ST = nullptr;
102 const AMDGPUTargetMachine *TM = nullptr;
103 const TargetLibraryInfo *TLInfo = nullptr;
104 AssumptionCache *AC = nullptr;
105 DominatorTree *DT = nullptr;
106 UniformityInfo *UA = nullptr;
107 Module *Mod = nullptr;
108 const DataLayout *DL = nullptr;
109 bool HasUnsafeFPMath = false;
110 bool HasFP32DenormalFlush = false;
111 bool FlowChanged = false;
112 mutable Function *SqrtF32 = nullptr;
113 mutable Function *LdexpF32 = nullptr;
114
115 DenseMap<const PHINode *, bool> BreakPhiNodesCache;
116
117 Function *getSqrtF32() const {
118 if (SqrtF32)
119 return SqrtF32;
120
121 LLVMContext &Ctx = Mod->getContext();
122 SqrtF32 = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_sqrt,
123 {Type::getFloatTy(Ctx)});
124 return SqrtF32;
125 }
126
127 Function *getLdexpF32() const {
128 if (LdexpF32)
129 return LdexpF32;
130
131 LLVMContext &Ctx = Mod->getContext();
132 LdexpF32 = Intrinsic::getDeclaration(
133 Mod, Intrinsic::ldexp, {Type::getFloatTy(Ctx), Type::getInt32Ty(Ctx)});
134 return LdexpF32;
135 }
136
137 bool canBreakPHINode(const PHINode &I);
138
143 /// \returns \p T's base element bit width.
144 unsigned getBaseElementBitWidth(const Type *T) const;
145
146 /// \returns Equivalent 32 bit integer type for given type \p T. For example,
147 /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
148 /// is returned.
149 Type *getI32Ty(IRBuilder<> &B, const Type *T) const;
150
151 /// \returns True if binary operation \p I is a signed binary operation, false
152 /// otherwise.
153 bool isSigned(const BinaryOperator &I) const;
154
155 /// \returns True if the condition of 'select' operation \p I comes from a
156 /// signed 'icmp' operation, false otherwise.
157 bool isSigned(const SelectInst &I) const;
158
159 /// \returns True if type \p T needs to be promoted to 32 bit integer type,
160 /// false otherwise.
161 bool needsPromotionToI32(const Type *T) const;
162
163 /// Return true if \p T is a legal scalar floating point type.
164 bool isLegalFloatingTy(const Type *T) const;
165
166 /// Wrapper to pass all the arguments to computeKnownFPClass
167 KnownFPClass computeKnownFPClass(const Value *V, FPClassTest Interested,
168 const Instruction *CtxI) const {
169 return llvm::computeKnownFPClass(V, *DL, Interested, 0, TLInfo, AC, CtxI,
170 DT);
171 }
172
173 bool canIgnoreDenormalInput(const Value *V, const Instruction *CtxI) const {
174 return HasFP32DenormalFlush ||
175 computeKnownFPClass(V, fcSubnormal, CtxI).isKnownNeverSubnormal();
176 }
177
178 /// Promotes uniform binary operation \p I to equivalent 32 bit binary
179 /// operation.
180 ///
181 /// \details \p I's base element bit width must be greater than 1 and less
182 /// than or equal to 16. Promotion is done by sign or zero extending operands
183 /// to 32 bits, replacing \p I with an equivalent 32 bit binary operation, and
184 /// truncating the result of the 32 bit binary operation back to \p I's
185 /// original type. Division operations are not promoted.
186 ///
187 /// \returns True if \p I is promoted to equivalent 32 bit binary operation,
188 /// false otherwise.
189 bool promoteUniformOpToI32(BinaryOperator &I) const;
190
191 /// Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
192 ///
193 /// \details \p I's base element bit width must be greater than 1 and less
194 /// than or equal to 16. Promotion is done by sign or zero extending operands to
195 /// 32 bits, and replacing \p I with a 32 bit 'icmp' operation.
196 ///
197 /// \returns True.
198 bool promoteUniformOpToI32(ICmpInst &I) const;
199
200 /// Promotes uniform 'select' operation \p I to 32 bit 'select'
201 /// operation.
202 ///
203 /// \details \p I's base element bit width must be greater than 1 and less
204 /// than or equal to 16. Promotion is done by sign or zero extending operands
205 /// to 32 bits, replacing \p I with a 32 bit 'select' operation, and truncating
206 /// the result of the 32 bit 'select' operation back to \p I's original type.
207 ///
208 /// \returns True.
209 bool promoteUniformOpToI32(SelectInst &I) const;
210
211 /// Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse'
212 /// intrinsic.
213 ///
214 /// \details \p I's base element bit width must be greater than 1 and less
215 /// than or equal to 16. Promotion is done by zero extending the operand to 32
216 /// bits, replacing \p I with 32 bit 'bitreverse' intrinsic, shifting the
217 /// result of 32 bit 'bitreverse' intrinsic to the right with zero fill (the
218 /// shift amount is 32 minus \p I's base element bit width), and truncating
219 /// the result of the shift operation back to \p I's original type.
220 ///
221 /// \returns True.
222 bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;
223
224 /// \returns The minimum number of bits needed to store the value of \p Op as
225 /// an unsigned integer. Truncating to this size and then zero-extending to
226 /// the original size will not change the value.
227 unsigned numBitsUnsigned(Value *Op) const;
228
229 /// \returns The minimum number of bits needed to store the value of \p Op as a
230 /// signed integer. Truncating to this size and then sign-extending to
231 /// the original size will not change the value.
232 unsigned numBitsSigned(Value *Op) const;
233
234 /// Replace mul instructions with llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.i24 when
235 /// the operands fit in 24 bits; SelectionDAG can lose this known-bits information.
236 bool replaceMulWithMul24(BinaryOperator &I) const;
237
238 /// Perform the same fold as the equivalently named function in DAGCombiner.
239 /// Since we expand some divisions here, perform this before the expansion obscures the pattern.
240 bool foldBinOpIntoSelect(BinaryOperator &I) const;
241
242 bool divHasSpecialOptimization(BinaryOperator &I,
243 Value *Num, Value *Den) const;
244 int getDivNumBits(BinaryOperator &I,
245 Value *Num, Value *Den,
246 unsigned AtLeast, bool Signed) const;
247
248 /// Expands 24 bit div or rem.
249 Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I,
250 Value *Num, Value *Den,
251 bool IsDiv, bool IsSigned) const;
252
253 Value *expandDivRem24Impl(IRBuilder<> &Builder, BinaryOperator &I,
254 Value *Num, Value *Den, unsigned NumBits,
255 bool IsDiv, bool IsSigned) const;
256
257 /// Expands 32 bit div or rem.
258 Value* expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I,
259 Value *Num, Value *Den) const;
260
261 Value *shrinkDivRem64(IRBuilder<> &Builder, BinaryOperator &I,
262 Value *Num, Value *Den) const;
263 void expandDivRem64(BinaryOperator &I) const;
264
265 /// Check whether a scalar load can be widened.
266 ///
267 /// \details Uniform, sub-dword loads from constant memory can be widened to a
268 /// full 32 bits and then truncated, which allows a scalar load to be used
269 /// instead of a vector load.
270 ///
271 /// \returns True if the load is eligible for widening.
272
273 bool canWidenScalarExtLoad(LoadInst &I) const;
274
275 Value *matchFractPat(IntrinsicInst &I);
276 Value *applyFractPat(IRBuilder<> &Builder, Value *FractArg);
277
278 bool canOptimizeWithRsq(const FPMathOperator *SqrtOp, FastMathFlags DivFMF,
279 FastMathFlags SqrtFMF) const;
280
281 Value *optimizeWithRsq(IRBuilder<> &Builder, Value *Num, Value *Den,
282 FastMathFlags DivFMF, FastMathFlags SqrtFMF,
283 const Instruction *CtxI) const;
284
285 Value *optimizeWithRcp(IRBuilder<> &Builder, Value *Num, Value *Den,
286 FastMathFlags FMF, const Instruction *CtxI) const;
287 Value *optimizeWithFDivFast(IRBuilder<> &Builder, Value *Num, Value *Den,
288 float ReqdAccuracy) const;
289
290 Value *visitFDivElement(IRBuilder<> &Builder, Value *Num, Value *Den,
291 FastMathFlags DivFMF, FastMathFlags SqrtFMF,
292 Value *RsqOp, const Instruction *FDiv,
293 float ReqdAccuracy) const;
294
295 std::pair<Value *, Value *> getFrexpResults(IRBuilder<> &Builder,
296 Value *Src) const;
297
298 Value *emitRcpIEEE1ULP(IRBuilder<> &Builder, Value *Src,
299 bool IsNegative) const;
300 Value *emitFrexpDiv(IRBuilder<> &Builder, Value *LHS, Value *RHS,
301 FastMathFlags FMF) const;
302 Value *emitSqrtIEEE2ULP(IRBuilder<> &Builder, Value *Src,
303 FastMathFlags FMF) const;
304
305 public:
306 bool visitFDiv(BinaryOperator &I);
307
308 bool visitInstruction(Instruction &I) { return false; }
309 bool visitBinaryOperator(BinaryOperator &I);
310 bool visitLoadInst(LoadInst &I);
311 bool visitICmpInst(ICmpInst &I);
312 bool visitSelectInst(SelectInst &I);
313 bool visitPHINode(PHINode &I);
314 bool visitAddrSpaceCastInst(AddrSpaceCastInst &I);
315
316 bool visitIntrinsicInst(IntrinsicInst &I);
317 bool visitBitreverseIntrinsicInst(IntrinsicInst &I);
318 bool visitMinNum(IntrinsicInst &I);
319 bool visitSqrt(IntrinsicInst &I);
320 bool run(Function &F);
321 };
322
323 class AMDGPUCodeGenPrepare : public FunctionPass {
324 private:
325 AMDGPUCodeGenPrepareImpl Impl;
326
327 public:
328 static char ID;
329 AMDGPUCodeGenPrepare() : FunctionPass(ID) {
330 initializeAMDGPUCodeGenPreparePass(*PassRegistry::getPassRegistry());
331 }
332 void getAnalysisUsage(AnalysisUsage &AU) const override {
333 AU.addRequired<AssumptionCacheTracker>();
334 AU.addRequired<UniformityInfoWrapperPass>();
335 AU.addRequired<TargetLibraryInfoWrapperPass>();
336
337 // FIXME: Division expansion needs to preserve the dominator tree.
338 if (!ExpandDiv64InIR)
339 AU.setPreservesAll();
340 }
341 bool runOnFunction(Function &F) override;
342 bool doInitialization(Module &M) override;
343 StringRef getPassName() const override { return "AMDGPU IR optimizations"; }
344 };
345
346 } // end anonymous namespace
347
348 bool AMDGPUCodeGenPrepareImpl::run(Function &F) {
349 BreakPhiNodesCache.clear();
350 bool MadeChange = false;
351
352 Function::iterator NextBB;
353 for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; FI = NextBB) {
354 BasicBlock *BB = &*FI;
355 NextBB = std::next(FI);
356
357 BasicBlock::iterator Next;
358 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;
359 I = Next) {
360 Next = std::next(I);
361
362 MadeChange |= visit(*I);
363
364 if (Next != E) { // Control flow changed
365 BasicBlock *NextInstBB = Next->getParent();
366 if (NextInstBB != BB) {
367 BB = NextInstBB;
368 E = BB->end();
369 FE = F.end();
370 }
371 }
372 }
373 }
374 return MadeChange;
375 }
376
377 unsigned AMDGPUCodeGenPrepareImpl::getBaseElementBitWidth(const Type *T) const {
378 assert(needsPromotionToI32(T) && "T does not need promotion to i32");
379
380 if (T->isIntegerTy())
381 return T->getIntegerBitWidth();
382 return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
383 }
384
385 Type *AMDGPUCodeGenPrepareImpl::getI32Ty(IRBuilder<> &B, const Type *T) const {
386 assert(needsPromotionToI32(T) && "T does not need promotion to i32");
387
388 if (T->isIntegerTy())
389 return B.getInt32Ty();
390 return FixedVectorType::get(B.getInt32Ty(), cast<FixedVectorType>(T));
391 }
392
393 bool AMDGPUCodeGenPrepareImpl::isSigned(const BinaryOperator &I) const {
394 return I.getOpcode() == Instruction::AShr ||
395 I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;
396 }
397
398 bool AMDGPUCodeGenPrepareImpl::isSigned(const SelectInst &I) const {
399 return isa<ICmpInst>(I.getOperand(0)) ?
400 cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
401 }
402
403 bool AMDGPUCodeGenPrepareImpl::needsPromotionToI32(const Type *T) const {
404 if (!Widen16BitOps)
405 return false;
406
407 const IntegerType *IntTy = dyn_cast<IntegerType>(T);
408 if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
409 return true;
410
411 if (const VectorType *VT = dyn_cast<VectorType>(T)) {
412 // TODO: The set of packed operations is more limited, so may want to
413 // promote some anyway.
414 if (ST->hasVOP3PInsts())
415 return false;
416
417 return needsPromotionToI32(VT->getElementType());
418 }
419
420 return false;
421 }
422
423 bool AMDGPUCodeGenPrepareImpl::isLegalFloatingTy(const Type *Ty) const {
424 return Ty->isFloatTy() || Ty->isDoubleTy() ||
425 (Ty->isHalfTy() && ST->has16BitInsts());
426 }
427
428 // Return true if the op promoted to i32 should have nsw set.
429 static bool promotedOpIsNSW(const Instruction &I) {
430 switch (I.getOpcode()) {
431 case Instruction::Shl:
432 case Instruction::Add:
433 case Instruction::Sub:
434 return true;
435 case Instruction::Mul:
436 return I.hasNoUnsignedWrap();
437 default:
438 return false;
439 }
440 }
441
442 // Return true if the op promoted to i32 should have nuw set.
443 static bool promotedOpIsNUW(const Instruction &I) {
444 switch (I.getOpcode()) {
445 case Instruction::Shl:
446 case Instruction::Add:
447 case Instruction::Mul:
448 return true;
449 case Instruction::Sub:
450 return I.hasNoUnsignedWrap();
451 default:
452 return false;
453 }
454 }
455
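// A note on the conditions checked below: the load must be simple (non-atomic,
// non-volatile), smaller than 32 bits, aligned to at least 4 bytes, and
// uniform, so the widened 32-bit load can be done as a scalar load.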
456 bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &I) const {
457 Type *Ty = I.getType();
458 const DataLayout &DL = Mod->getDataLayout();
459 int TySize = DL.getTypeSizeInBits(Ty);
460 Align Alignment = DL.getValueOrABITypeAlignment(I.getAlign(), Ty);
461
462 return I.isSimple() && TySize < 32 && Alignment >= 4 && UA->isUniform(&I);
463 }
464
465 bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(BinaryOperator &I) const {
466 assert(needsPromotionToI32(I.getType()) &&
467 "I does not need promotion to i32");
468
469 if (I.getOpcode() == Instruction::SDiv ||
470 I.getOpcode() == Instruction::UDiv ||
471 I.getOpcode() == Instruction::SRem ||
472 I.getOpcode() == Instruction::URem)
473 return false;
474
475 IRBuilder<> Builder(&I);
476 Builder.SetCurrentDebugLocation(I.getDebugLoc());
477
478 Type *I32Ty = getI32Ty(Builder, I.getType());
479 Value *ExtOp0 = nullptr;
480 Value *ExtOp1 = nullptr;
481 Value *ExtRes = nullptr;
482 Value *TruncRes = nullptr;
483
484 if (isSigned(I)) {
485 ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
486 ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
487 } else {
488 ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
489 ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
490 }
491
492 ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
493 if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
494 if (promotedOpIsNSW(cast<Instruction>(I)))
495 Inst->setHasNoSignedWrap();
496
497 if (promotedOpIsNUW(cast<Instruction>(I)))
498 Inst->setHasNoUnsignedWrap();
499
500 if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
501 Inst->setIsExact(ExactOp->isExact());
502 }
503
504 TruncRes = Builder.CreateTrunc(ExtRes, I.getType());
505
506 I.replaceAllUsesWith(TruncRes);
507 I.eraseFromParent();
508
509 return true;
510 }
511
512 bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(ICmpInst &I) const {
513 assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
514 "I does not need promotion to i32");
515
516 IRBuilder<> Builder(&I);
517 Builder.SetCurrentDebugLocation(I.getDebugLoc());
518
519 Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
520 Value *ExtOp0 = nullptr;
521 Value *ExtOp1 = nullptr;
522 Value *NewICmp = nullptr;
523
524 if (I.isSigned()) {
525 ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
526 ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
527 } else {
528 ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
529 ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
530 }
531 NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);
532
533 I.replaceAllUsesWith(NewICmp);
534 I.eraseFromParent();
535
536 return true;
537 }
538
539 bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(SelectInst &I) const {
540 assert(needsPromotionToI32(I.getType()) &&
541 "I does not need promotion to i32");
542
543 IRBuilder<> Builder(&I);
544 Builder.SetCurrentDebugLocation(I.getDebugLoc());
545
546 Type *I32Ty = getI32Ty(Builder, I.getType());
547 Value *ExtOp1 = nullptr;
548 Value *ExtOp2 = nullptr;
549 Value *ExtRes = nullptr;
550 Value *TruncRes = nullptr;
551
552 if (isSigned(I)) {
553 ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
554 ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
555 } else {
556 ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
557 ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
558 }
559 ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
560 TruncRes = Builder.CreateTrunc(ExtRes, I.getType());
561
562 I.replaceAllUsesWith(TruncRes);
563 I.eraseFromParent();
564
565 return true;
566 }
567
568 bool AMDGPUCodeGenPrepareImpl::promoteUniformBitreverseToI32(
569 IntrinsicInst &I) const {
570 assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
571 "I must be bitreverse intrinsic");
572 assert(needsPromotionToI32(I.getType()) &&
573 "I does not need promotion to i32");
574
575 IRBuilder<> Builder(&I);
576 Builder.SetCurrentDebugLocation(I.getDebugLoc());
577
578 Type *I32Ty = getI32Ty(Builder, I.getType());
579 Function *I32 =
580 Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty });
581 Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
582 Value *ExtRes = Builder.CreateCall(I32, { ExtOp });
583 Value *LShrOp =
584 Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
585 Value *TruncRes =
586 Builder.CreateTrunc(LShrOp, I.getType());
587
588 I.replaceAllUsesWith(TruncRes);
589 I.eraseFromParent();
590
591 return true;
592 }
593
594 unsigned AMDGPUCodeGenPrepareImpl::numBitsUnsigned(Value *Op) const {
595 return computeKnownBits(Op, *DL, 0, AC).countMaxActiveBits();
596 }
597
598 unsigned AMDGPUCodeGenPrepareImpl::numBitsSigned(Value *Op) const {
599 return ComputeMaxSignificantBits(Op, *DL, 0, AC);
600 }
601
602 static void extractValues(IRBuilder<> &Builder,
603 SmallVectorImpl<Value *> &Values, Value *V) {
604 auto *VT = dyn_cast<FixedVectorType>(V->getType());
605 if (!VT) {
606 Values.push_back(V);
607 return;
608 }
609
610 for (int I = 0, E = VT->getNumElements(); I != E; ++I)
611 Values.push_back(Builder.CreateExtractElement(V, I));
612 }
613
614 static Value *insertValues(IRBuilder<> &Builder,
615 Type *Ty,
616 SmallVectorImpl<Value *> &Values) {
617 if (!Ty->isVectorTy()) {
618 assert(Values.size() == 1);
619 return Values[0];
620 }
621
622 Value *NewVal = PoisonValue::get(Ty);
623 for (int I = 0, E = Values.size(); I != E; ++I)
624 NewVal = Builder.CreateInsertElement(NewVal, Values[I], I);
625
626 return NewVal;
627 }
628
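// For example, a divergent i32 multiply whose operands are known to fit in 24
// bits,
//   %r = mul i32 %a, %b
// becomes (roughly)
//   %r = call i32 @llvm.amdgcn.mul.u24(i32 %a, i32 %b)
// or the signed llvm.amdgcn.mul.i24 form when only the signed bound holds.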
629 bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const {
630 if (I.getOpcode() != Instruction::Mul)
631 return false;
632
633 Type *Ty = I.getType();
634 unsigned Size = Ty->getScalarSizeInBits();
635 if (Size <= 16 && ST->has16BitInsts())
636 return false;
637
638 // Prefer scalar if this could be s_mul_i32
639 if (UA->isUniform(&I))
640 return false;
641
642 Value *LHS = I.getOperand(0);
643 Value *RHS = I.getOperand(1);
644 IRBuilder<> Builder(&I);
645 Builder.SetCurrentDebugLocation(I.getDebugLoc());
646
647 unsigned LHSBits = 0, RHSBits = 0;
648 bool IsSigned = false;
649
650 if (ST->hasMulU24() && (LHSBits = numBitsUnsigned(LHS)) <= 24 &&
651 (RHSBits = numBitsUnsigned(RHS)) <= 24) {
652 IsSigned = false;
653
654 } else if (ST->hasMulI24() && (LHSBits = numBitsSigned(LHS)) <= 24 &&
655 (RHSBits = numBitsSigned(RHS)) <= 24) {
656 IsSigned = true;
657
658 } else
659 return false;
660
661 SmallVector<Value *, 4> LHSVals;
662 SmallVector<Value *, 4> RHSVals;
663 SmallVector<Value *, 4> ResultVals;
664 extractValues(Builder, LHSVals, LHS);
665 extractValues(Builder, RHSVals, RHS);
666
667 IntegerType *I32Ty = Builder.getInt32Ty();
668 IntegerType *IntrinTy = Size > 32 ? Builder.getInt64Ty() : I32Ty;
669 Type *DstTy = LHSVals[0]->getType();
670
671 for (int I = 0, E = LHSVals.size(); I != E; ++I) {
672 Value *LHS = IsSigned ? Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty)
673 : Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
674 Value *RHS = IsSigned ? Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty)
675 : Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);
676 Intrinsic::ID ID =
677 IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
678 Value *Result = Builder.CreateIntrinsic(ID, {IntrinTy}, {LHS, RHS});
679 Result = IsSigned ? Builder.CreateSExtOrTrunc(Result, DstTy)
680 : Builder.CreateZExtOrTrunc(Result, DstTy);
681 ResultVals.push_back(Result);
682 }
683
684 Value *NewVal = insertValues(Builder, Ty, ResultVals);
685 NewVal->takeName(&I);
686 I.replaceAllUsesWith(NewVal);
687 I.eraseFromParent();
688
689 return true;
690 }
691
692 // Find a select instruction, which may have been cast. This is mostly to deal
693 // with cases where i16 selects were promoted here to i32.
694 static SelectInst *findSelectThroughCast(Value *V, CastInst *&Cast) {
695 Cast = nullptr;
696 if (SelectInst *Sel = dyn_cast<SelectInst>(V))
697 return Sel;
698
699 if ((Cast = dyn_cast<CastInst>(V))) {
700 if (SelectInst *Sel = dyn_cast<SelectInst>(Cast->getOperand(0)))
701 return Sel;
702 }
703
704 return nullptr;
705 }
706
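// For example, with a single-use select dividing a constant:
//   %s = select i1 %c, i32 8, i32 4
//   %d = udiv i32 16, %s
// both arms constant-fold with the other operand, giving:
//   %d = select i1 %c, i32 2, i32 4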
707 bool AMDGPUCodeGenPrepareImpl::foldBinOpIntoSelect(BinaryOperator &BO) const {
708 // Don't do this unless the old select is going away. We want to eliminate the
709 // binary operator, not replace a binop with a select.
710 int SelOpNo = 0;
711
712 CastInst *CastOp;
713
714 // TODO: Should probably try to handle some cases with multiple
715 // users. Duplicating the select may be profitable for division.
716 SelectInst *Sel = findSelectThroughCast(BO.getOperand(0), CastOp);
717 if (!Sel || !Sel->hasOneUse()) {
718 SelOpNo = 1;
719 Sel = findSelectThroughCast(BO.getOperand(1), CastOp);
720 }
721
722 if (!Sel || !Sel->hasOneUse())
723 return false;
724
725 Constant *CT = dyn_cast<Constant>(Sel->getTrueValue());
726 Constant *CF = dyn_cast<Constant>(Sel->getFalseValue());
727 Constant *CBO = dyn_cast<Constant>(BO.getOperand(SelOpNo ^ 1));
728 if (!CBO || !CT || !CF)
729 return false;
730
731 if (CastOp) {
732 if (!CastOp->hasOneUse())
733 return false;
734 CT = ConstantFoldCastOperand(CastOp->getOpcode(), CT, BO.getType(), *DL);
735 CF = ConstantFoldCastOperand(CastOp->getOpcode(), CF, BO.getType(), *DL);
736 }
737
738 // TODO: Handle special 0/-1 cases DAG combine does, although we only really
739 // need to handle divisions here.
740 Constant *FoldedT = SelOpNo ?
741 ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CT, *DL) :
742 ConstantFoldBinaryOpOperands(BO.getOpcode(), CT, CBO, *DL);
743 if (!FoldedT || isa<ConstantExpr>(FoldedT))
744 return false;
745
746 Constant *FoldedF = SelOpNo ?
747 ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CF, *DL) :
748 ConstantFoldBinaryOpOperands(BO.getOpcode(), CF, CBO, *DL);
749 if (!FoldedF || isa<ConstantExpr>(FoldedF))
750 return false;
751
752 IRBuilder<> Builder(&BO);
753 Builder.SetCurrentDebugLocation(BO.getDebugLoc());
754 if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(&BO))
755 Builder.setFastMathFlags(FPOp->getFastMathFlags());
756
757 Value *NewSelect = Builder.CreateSelect(Sel->getCondition(),
758 FoldedT, FoldedF);
759 NewSelect->takeName(&BO);
760 BO.replaceAllUsesWith(NewSelect);
761 BO.eraseFromParent();
762 if (CastOp)
763 CastOp->eraseFromParent();
764 Sel->eraseFromParent();
765 return true;
766 }
767
768 std::pair<Value *, Value *>
769 AMDGPUCodeGenPrepareImpl::getFrexpResults(IRBuilder<> &Builder,
770 Value *Src) const {
771 Type *Ty = Src->getType();
772 Value *Frexp = Builder.CreateIntrinsic(Intrinsic::frexp,
773 {Ty, Builder.getInt32Ty()}, Src);
774 Value *FrexpMant = Builder.CreateExtractValue(Frexp, {0});
775
776 // Bypass the bug workaround for the exponent result since it doesn't matter.
777 // TODO: Does the bug workaround even really need to consider the exponent
778 // result? It's unspecified by the spec.
779
780 Value *FrexpExp =
781 ST->hasFractBug()
782 ? Builder.CreateIntrinsic(Intrinsic::amdgcn_frexp_exp,
783 {Builder.getInt32Ty(), Ty}, Src)
784 : Builder.CreateExtractValue(Frexp, {1});
785 return {FrexpMant, FrexpExp};
786 }
787
788 /// Emit an expansion of 1.0 / Src good for 1ulp that supports denormals.
789 Value *AMDGPUCodeGenPrepareImpl::emitRcpIEEE1ULP(IRBuilder<> &Builder,
790 Value *Src,
791 bool IsNegative) const {
792 // Same as for 1.0, but expand the sign out of the constant.
793 // -1.0 / x -> rcp (fneg x)
794 if (IsNegative)
795 Src = Builder.CreateFNeg(Src);
796
797 // The rcp instruction doesn't support denormals, so scale the input
798 // out of the denormal range and convert at the end.
799 //
800 // Expand as 2^-n * (1.0 / (x * 2^n))
801
802 // TODO: Skip scaling if input is known never denormal and the input
803 // range won't underflow to denormal. The hard part is knowing the
804 // result. We need a range check, the result could be denormal for
805 // 0x1p+126 < den <= 0x1p+127.
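// frexp gives Src == Mant * 2^Exp with Mant in [0.5, 1.0), so
// 1.0 / Src == (1.0 / Mant) * 2^-Exp == ldexp(rcp(Mant), -Exp).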
806 auto [FrexpMant, FrexpExp] = getFrexpResults(Builder, Src);
807 Value *ScaleFactor = Builder.CreateNeg(FrexpExp);
808 Value *Rcp = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, FrexpMant);
809 return Builder.CreateCall(getLdexpF32(), {Rcp, ScaleFactor});
810 }
811
812 /// Emit a 2ulp expansion for fdiv by using frexp for input scaling.
813 Value *AMDGPUCodeGenPrepareImpl::emitFrexpDiv(IRBuilder<> &Builder, Value *LHS,
814 Value *RHS,
815 FastMathFlags FMF) const {
816 // If we have to work around the fract/frexp bug, we're worse off than
817 // using the fdiv.fast expansion. The full safe expansion is faster if we have
818 // fast FMA.
819 if (HasFP32DenormalFlush && ST->hasFractBug() && !ST->hasFastFMAF32() &&
820 (!FMF.noNaNs() || !FMF.noInfs()))
821 return nullptr;
822
823 // We're scaling the LHS to avoid a denormal input, and scaling the
824 // denominator to avoid large values underflowing the result.
825 auto [FrexpMantRHS, FrexpExpRHS] = getFrexpResults(Builder, RHS);
826
827 Value *Rcp =
828 Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, FrexpMantRHS);
829
830 auto [FrexpMantLHS, FrexpExpLHS] = getFrexpResults(Builder, LHS);
831 Value *Mul = Builder.CreateFMul(FrexpMantLHS, Rcp);
832
833 // We multiplied by 2^N/2^M, so we need to multiply by 2^(N-M) to scale the
834 // result.
835 Value *ExpDiff = Builder.CreateSub(FrexpExpLHS, FrexpExpRHS);
836 return Builder.CreateCall(getLdexpF32(), {Mul, ExpDiff});
837 }
838
839 /// Emit a sqrt that handles denormals and is accurate to 2ulp.
840 Value *AMDGPUCodeGenPrepareImpl::emitSqrtIEEE2ULP(IRBuilder<> &Builder,
841 Value *Src,
842 FastMathFlags FMF) const {
843 Type *Ty = Src->getType();
844 APFloat SmallestNormal =
845 APFloat::getSmallestNormalized(Ty->getFltSemantics());
846 Value *NeedScale =
847 Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));
848
849 ConstantInt *Zero = Builder.getInt32(0);
850 Value *InputScaleFactor =
851 Builder.CreateSelect(NeedScale, Builder.getInt32(32), Zero);
852
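// Scaling a denormal input by 2^32 brings it into the normal range, and
// sqrt(x * 2^32) == sqrt(x) * 2^16, so the result is rescaled by 2^-16 below.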
853 Value *Scaled = Builder.CreateCall(getLdexpF32(), {Src, InputScaleFactor});
854
855 Value *Sqrt = Builder.CreateCall(getSqrtF32(), Scaled);
856
857 Value *OutputScaleFactor =
858 Builder.CreateSelect(NeedScale, Builder.getInt32(-16), Zero);
859 return Builder.CreateCall(getLdexpF32(), {Sqrt, OutputScaleFactor});
860 }
861
862 /// Emit an expansion of 1.0 / sqrt(Src) good for 1ulp that supports denormals.
863 static Value *emitRsqIEEE1ULP(IRBuilder<> &Builder, Value *Src,
864 bool IsNegative) {
865 // bool need_scale = x < 0x1p-126f;
866 // float input_scale = need_scale ? 0x1.0p+24f : 1.0f;
867 // float output_scale = need_scale ? 0x1.0p+12f : 1.0f;
868 // rsq(x * input_scale) * output_scale;
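//
// rsq(x * 2^24) == rsq(x) * 2^-12, so multiplying the result by 2^12
// compensates for the input scaling (the output scale is negated below to
// fold in a negative numerator).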
869
870 Type *Ty = Src->getType();
871 APFloat SmallestNormal =
872 APFloat::getSmallestNormalized(Ty->getFltSemantics());
873 Value *NeedScale =
874 Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));
875 Constant *One = ConstantFP::get(Ty, 1.0);
876 Constant *InputScale = ConstantFP::get(Ty, 0x1.0p+24);
877 Constant *OutputScale =
878 ConstantFP::get(Ty, IsNegative ? -0x1.0p+12 : 0x1.0p+12);
879
880 Value *InputScaleFactor = Builder.CreateSelect(NeedScale, InputScale, One);
881
882 Value *ScaledInput = Builder.CreateFMul(Src, InputScaleFactor);
883 Value *Rsq = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, ScaledInput);
884 Value *OutputScaleFactor = Builder.CreateSelect(
885 NeedScale, OutputScale, IsNegative ? ConstantFP::get(Ty, -1.0) : One);
886
887 return Builder.CreateFMul(Rsq, OutputScaleFactor);
888 }
889
890 bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(const FPMathOperator *SqrtOp,
891 FastMathFlags DivFMF,
892 FastMathFlags SqrtFMF) const {
893 // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp.
894 if (!DivFMF.allowContract() || !SqrtFMF.allowContract())
895 return false;
896
897 // v_rsq_f32 gives 1ulp
898 return SqrtFMF.approxFunc() || HasUnsafeFPMath ||
899 SqrtOp->getFPAccuracy() >= 1.0f;
900 }
901
902 Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
903 IRBuilder<> &Builder, Value *Num, Value *Den, const FastMathFlags DivFMF,
904 const FastMathFlags SqrtFMF, const Instruction *CtxI) const {
905 // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp.
906 assert(DivFMF.allowContract() && SqrtFMF.allowContract());
907
908 // rsq_f16 is accurate to 0.51 ulp.
909 // rsq_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
910 // rsq_f64 is never accurate.
911 const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num);
912 if (!CLHS)
913 return nullptr;
914
915 assert(Den->getType()->isFloatTy());
916
917 bool IsNegative = false;
918
919 // TODO: Handle other numerator values with arcp.
920 if (CLHS->isExactlyValue(1.0) || (IsNegative = CLHS->isExactlyValue(-1.0))) {
921 // Add in the sqrt flags.
922 IRBuilder<>::FastMathFlagGuard Guard(Builder);
923 Builder.setFastMathFlags(DivFMF | SqrtFMF);
924
925 if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) || HasUnsafeFPMath ||
926 canIgnoreDenormalInput(Den, CtxI)) {
927 Value *Result = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, Den);
928 // -1.0 / sqrt(x) -> fneg(rsq(x))
929 return IsNegative ? Builder.CreateFNeg(Result) : Result;
930 }
931
932 return emitRsqIEEE1ULP(Builder, Den, IsNegative);
933 }
934
935 return nullptr;
936 }
937
938 // Optimize fdiv with rcp:
939 //
940 // 1/x -> rcp(x) when rcp is sufficiently accurate, or when an inaccurate
941 //        rcp is allowed with unsafe-fp-math or afn.
942 //
943 // a/b -> a*rcp(b) when arcp is allowed, and we only need to provide 1.0 ULP.
944 Value *
945 AMDGPUCodeGenPrepareImpl::optimizeWithRcp(IRBuilder<> &Builder, Value *Num,
946 Value *Den, FastMathFlags FMF,
947 const Instruction *CtxI) const {
948 // rcp_f16 is accurate to 0.51 ulp.
949 // rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
950 // rcp_f64 is never accurate.
951 assert(Den->getType()->isFloatTy());
952
953 if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num)) {
954 bool IsNegative = false;
955 if (CLHS->isExactlyValue(1.0) ||
956 (IsNegative = CLHS->isExactlyValue(-1.0))) {
957 Value *Src = Den;
958
959 if (HasFP32DenormalFlush || FMF.approxFunc()) {
960 // -1.0 / x -> 1.0 / fneg(x)
961 if (IsNegative)
962 Src = Builder.CreateFNeg(Src);
963
964 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
965 // the CI documentation have a worst case error of 1 ulp.
966 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK
967 // to use it as long as we aren't trying to use denormals.
968 //
969 // v_rcp_f16 and v_rsq_f16 DO support denormals.
970
971 // NOTE: v_sqrt and v_rcp will be combined to v_rsq later. So we don't
972 // insert rsq intrinsic here.
973
974 // 1.0 / x -> rcp(x)
975 return Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, Src);
976 }
977
978 // TODO: If the input isn't denormal, and we know the input exponent isn't
979 // big enough to introduce a denormal we can avoid the scaling.
980 return emitRcpIEEE1ULP(Builder, Src, IsNegative);
981 }
982 }
983
984 if (FMF.allowReciprocal()) {
985 // x / y -> x * (1.0 / y)
986
987 // TODO: Could avoid denormal scaling and use raw rcp if we knew the output
988 // will never underflow.
989 if (HasFP32DenormalFlush || FMF.approxFunc()) {
990 Value *Recip = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, Den);
991 return Builder.CreateFMul(Num, Recip);
992 }
993
994 Value *Recip = emitRcpIEEE1ULP(Builder, Den, false);
995 return Builder.CreateFMul(Num, Recip);
996 }
997
998 return nullptr;
999 }
1000
1001 // optimize with fdiv.fast:
1002 //
1003 // a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
1004 //
1005 // 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
1006 //
1007 // NOTE: optimizeWithRcp should be tried first because rcp is the preference.
1008 Value *AMDGPUCodeGenPrepareImpl::optimizeWithFDivFast(
1009 IRBuilder<> &Builder, Value *Num, Value *Den, float ReqdAccuracy) const {
1010 // fdiv.fast can achieve 2.5 ULP accuracy.
1011 if (ReqdAccuracy < 2.5f)
1012 return nullptr;
1013
1014 // Only have fdiv.fast for f32.
1015 assert(Den->getType()->isFloatTy());
1016
1017 bool NumIsOne = false;
1018 if (const ConstantFP *CNum = dyn_cast<ConstantFP>(Num)) {
1019 if (CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0))
1020 NumIsOne = true;
1021 }
1022
1023 // fdiv.fast does not support denormals, but it is always fine to use for 1.0/x.
1024 //
1025 // TODO: This works for any value with a specific known exponent range, don't
1026 // just limit to constant 1.
1027 if (!HasFP32DenormalFlush && !NumIsOne)
1028 return nullptr;
1029
1030 return Builder.CreateIntrinsic(Intrinsic::amdgcn_fdiv_fast, {}, {Num, Den});
1031 }
1032
1033 Value *AMDGPUCodeGenPrepareImpl::visitFDivElement(
1034 IRBuilder<> &Builder, Value *Num, Value *Den, FastMathFlags DivFMF,
1035 FastMathFlags SqrtFMF, Value *RsqOp, const Instruction *FDivInst,
1036 float ReqdDivAccuracy) const {
1037 if (RsqOp) {
1038 Value *Rsq =
1039 optimizeWithRsq(Builder, Num, RsqOp, DivFMF, SqrtFMF, FDivInst);
1040 if (Rsq)
1041 return Rsq;
1042 }
1043
1044 Value *Rcp = optimizeWithRcp(Builder, Num, Den, DivFMF, FDivInst);
1045 if (Rcp)
1046 return Rcp;
1047
1048 // In the basic case fdiv_fast has the same instruction count as the frexp div
1049 // expansion. Slightly prefer fdiv_fast since it ends in an fmul that can
1050 // potentially be fused into a user. Also, the materialized constants can be
1051 // reused across multiple instances.
1052 Value *FDivFast = optimizeWithFDivFast(Builder, Num, Den, ReqdDivAccuracy);
1053 if (FDivFast)
1054 return FDivFast;
1055
1056 return emitFrexpDiv(Builder, Num, Den, DivFMF);
1057 }
1058
1059 // Optimization is performed based on fpmath, fast math flags, and the
1060 // denormal mode to expand fdiv with either rcp or fdiv.fast.
1061 //
1062 // With rcp:
1063 // 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
1064 // allowed with unsafe-fp-math or afn.
1065 //
1066 // a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn.
1067 //
1068 // With fdiv.fast:
1069 // a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
1070 //
1071 // 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
1072 //
1073 // NOTE: rcp is the preference in cases that both are legal.
1074 bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
1075 if (DisableFDivExpand)
1076 return false;
1077
1078 Type *Ty = FDiv.getType()->getScalarType();
1079 if (!Ty->isFloatTy())
1080 return false;
1081
1082 // The f64 rcp/rsq approximations are pretty inaccurate. We can do an
1083 // expansion around them in codegen. f16 is good enough to always use.
1084
1085 const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
1086 const FastMathFlags DivFMF = FPOp->getFastMathFlags();
1087 const float ReqdAccuracy = FPOp->getFPAccuracy();
1088
1089 FastMathFlags SqrtFMF;
1090
1091 Value *Num = FDiv.getOperand(0);
1092 Value *Den = FDiv.getOperand(1);
1093
1094 Value *RsqOp = nullptr;
1095 auto *DenII = dyn_cast<IntrinsicInst>(Den);
1096 if (DenII && DenII->getIntrinsicID() == Intrinsic::sqrt &&
1097 DenII->hasOneUse()) {
1098 const auto *SqrtOp = cast<FPMathOperator>(DenII);
1099 SqrtFMF = SqrtOp->getFastMathFlags();
1100 if (canOptimizeWithRsq(SqrtOp, DivFMF, SqrtFMF))
1101 RsqOp = SqrtOp->getOperand(0);
1102 }
1103
1104 // Inaccurate rcp is allowed with unsafe-fp-math or afn.
1105 //
1106 // Defer to codegen to handle this.
1107 //
1108 // TODO: Decide on an interpretation for interactions between afn + arcp +
1109 // !fpmath, and make it consistent between here and codegen. For now, defer
1110 // expansion of afn to codegen. The current interpretation is aggressive enough
1111 // that we don't need any pre-handling here even when we have better
1112 // information. A more conservative interpretation could make use of handling here.
1113 const bool AllowInaccurateRcp = HasUnsafeFPMath || DivFMF.approxFunc();
1114 if (!RsqOp && AllowInaccurateRcp)
1115 return false;
1116
1117 // Defer the correct implementations to codegen.
1118 if (ReqdAccuracy < 1.0f)
1119 return false;
1120
1121 IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()));
1122 Builder.setFastMathFlags(DivFMF);
1123 Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());
1124
1125 SmallVector<Value *, 4> NumVals;
1126 SmallVector<Value *, 4> DenVals;
1127 SmallVector<Value *, 4> RsqDenVals;
1128 extractValues(Builder, NumVals, Num);
1129 extractValues(Builder, DenVals, Den);
1130
1131 if (RsqOp)
1132 extractValues(Builder, RsqDenVals, RsqOp);
1133
1134 SmallVector<Value *, 4> ResultVals(NumVals.size());
1135 for (int I = 0, E = NumVals.size(); I != E; ++I) {
1136 Value *NumElt = NumVals[I];
1137 Value *DenElt = DenVals[I];
1138 Value *RsqDenElt = RsqOp ? RsqDenVals[I] : nullptr;
1139
1140 Value *NewElt =
1141 visitFDivElement(Builder, NumElt, DenElt, DivFMF, SqrtFMF, RsqDenElt,
1142 cast<Instruction>(FPOp), ReqdAccuracy);
1143 if (!NewElt) {
1144 // Keep the original, but scalarized.
1145
1146 // This has the unfortunate side effect of sometimes scalarizing when
1147 // we're not going to do anything.
1148 NewElt = Builder.CreateFDiv(NumElt, DenElt);
1149 if (auto *NewEltInst = dyn_cast<Instruction>(NewElt))
1150 NewEltInst->copyMetadata(FDiv);
1151 }
1152
1153 ResultVals[I] = NewElt;
1154 }
1155
1156 Value *NewVal = insertValues(Builder, FDiv.getType(), ResultVals);
1157
1158 if (NewVal) {
1159 FDiv.replaceAllUsesWith(NewVal);
1160 NewVal->takeName(&FDiv);
1161 RecursivelyDeleteTriviallyDeadInstructions(&FDiv, TLInfo);
1162 }
1163
1164 return true;
1165 }
1166
1167 static bool hasUnsafeFPMath(const Function &F) {
1168 Attribute Attr = F.getFnAttribute("unsafe-fp-math");
1169 return Attr.getValueAsBool();
1170 }
1171
1172 static std::pair<Value*, Value*> getMul64(IRBuilder<> &Builder,
1173 Value *LHS, Value *RHS) {
1174 Type *I32Ty = Builder.getInt32Ty();
1175 Type *I64Ty = Builder.getInt64Ty();
1176
1177 Value *LHS_EXT64 = Builder.CreateZExt(LHS, I64Ty);
1178 Value *RHS_EXT64 = Builder.CreateZExt(RHS, I64Ty);
1179 Value *MUL64 = Builder.CreateMul(LHS_EXT64, RHS_EXT64);
1180 Value *Lo = Builder.CreateTrunc(MUL64, I32Ty);
1181 Value *Hi = Builder.CreateLShr(MUL64, Builder.getInt64(32));
1182 Hi = Builder.CreateTrunc(Hi, I32Ty);
1183 return std::pair(Lo, Hi);
1184 }
1185
1186 static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) {
1187 return getMul64(Builder, LHS, RHS).second;
1188 }
1189
1190 /// Figure out how many bits are really needed for this division. \p AtLeast is
1191 /// an optimization hint to bypass the second ComputeNumSignBits call if the
1192 /// first one is insufficient. Returns -1 on failure.
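/// For example, a signed i32 division whose operands each fit in 16 bits has
/// at least 17 sign bits per operand, giving DivBits = 32 - 17 + 1 = 16.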
1193 int AMDGPUCodeGenPrepareImpl::getDivNumBits(BinaryOperator &I, Value *Num,
1194 Value *Den, unsigned AtLeast,
1195 bool IsSigned) const {
1196 const DataLayout &DL = Mod->getDataLayout();
1197 unsigned LHSSignBits = ComputeNumSignBits(Num, DL, 0, AC, &I);
1198 if (LHSSignBits < AtLeast)
1199 return -1;
1200
1201 unsigned RHSSignBits = ComputeNumSignBits(Den, DL, 0, AC, &I);
1202 if (RHSSignBits < AtLeast)
1203 return -1;
1204
1205 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1206 unsigned DivBits = Num->getType()->getScalarSizeInBits() - SignBits;
1207 if (IsSigned)
1208 ++DivBits;
1209 return DivBits;
1210 }
1211
1212 // The fractional part of a float is enough to accurately represent up to
1213 // a 24-bit signed integer.
1214 Value *AMDGPUCodeGenPrepareImpl::expandDivRem24(IRBuilder<> &Builder,
1215 BinaryOperator &I, Value *Num,
1216 Value *Den, bool IsDiv,
1217 bool IsSigned) const {
1218 unsigned SSBits = Num->getType()->getScalarSizeInBits();
1219 // If Num bits <= 24, assume 0 signbits.
1220 unsigned AtLeast = (SSBits <= 24) ? 0 : (SSBits - 24 + IsSigned);
1221 int DivBits = getDivNumBits(I, Num, Den, AtLeast, IsSigned);
1222 if (DivBits == -1)
1223 return nullptr;
1224 return expandDivRem24Impl(Builder, I, Num, Den, DivBits, IsDiv, IsSigned);
1225 }
1226
1227 Value *AMDGPUCodeGenPrepareImpl::expandDivRem24Impl(
1228 IRBuilder<> &Builder, BinaryOperator &I, Value *Num, Value *Den,
1229 unsigned DivBits, bool IsDiv, bool IsSigned) const {
1230 Type *I32Ty = Builder.getInt32Ty();
1231 Num = Builder.CreateTrunc(Num, I32Ty);
1232 Den = Builder.CreateTrunc(Den, I32Ty);
1233
1234 Type *F32Ty = Builder.getFloatTy();
1235 ConstantInt *One = Builder.getInt32(1);
1236 Value *JQ = One;
1237
1238 if (IsSigned) {
1239 // char|short jq = ia ^ ib;
1240 JQ = Builder.CreateXor(Num, Den);
1241
1242 // jq = jq >> (bitsize - 2)
1243 JQ = Builder.CreateAShr(JQ, Builder.getInt32(30));
1244
1245 // jq = jq | 0x1
1246 JQ = Builder.CreateOr(JQ, One);
1247 }
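// At this point JQ is +1 when the operands have the same sign and -1 otherwise
// (it stays +1 for unsigned); it is the off-by-one correction added to the
// truncated quotient estimate when the remainder check below fires.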
1248
1249 // int ia = (int)LHS;
1250 Value *IA = Num;
1251
1252 // int ib = (int)RHS;
1253 Value *IB = Den;
1254
1255 // float fa = (float)ia;
1256 Value *FA = IsSigned ? Builder.CreateSIToFP(IA, F32Ty)
1257 : Builder.CreateUIToFP(IA, F32Ty);
1258
1259 // float fb = (float)ib;
1260 Value *FB = IsSigned ? Builder.CreateSIToFP(IB,F32Ty)
1261 : Builder.CreateUIToFP(IB,F32Ty);
1262
1263 Function *RcpDecl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp,
1264 Builder.getFloatTy());
1265 Value *RCP = Builder.CreateCall(RcpDecl, { FB });
1266 Value *FQM = Builder.CreateFMul(FA, RCP);
1267
1268 // fq = trunc(fqm);
1269 CallInst *FQ = Builder.CreateUnaryIntrinsic(Intrinsic::trunc, FQM);
1270 FQ->copyFastMathFlags(Builder.getFastMathFlags());
1271
1272 // float fqneg = -fq;
1273 Value *FQNeg = Builder.CreateFNeg(FQ);
1274
1275 // float fr = mad(fqneg, fb, fa);
1276 auto FMAD = !ST->hasMadMacF32Insts()
1277 ? Intrinsic::fma
1278 : (Intrinsic::ID)Intrinsic::amdgcn_fmad_ftz;
1279 Value *FR = Builder.CreateIntrinsic(FMAD,
1280 {FQNeg->getType()}, {FQNeg, FB, FA}, FQ);
1281
1282 // int iq = (int)fq;
1283 Value *IQ = IsSigned ? Builder.CreateFPToSI(FQ, I32Ty)
1284 : Builder.CreateFPToUI(FQ, I32Ty);
1285
1286 // fr = fabs(fr);
1287 FR = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FR, FQ);
1288
1289 // fb = fabs(fb);
1290 FB = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FB, FQ);
1291
1292 // int cv = fr >= fb;
1293 Value *CV = Builder.CreateFCmpOGE(FR, FB);
1294
1295 // jq = (cv ? jq : 0);
1296 JQ = Builder.CreateSelect(CV, JQ, Builder.getInt32(0));
1297
1298 // dst = iq + jq;
1299 Value *Div = Builder.CreateAdd(IQ, JQ);
1300
1301 Value *Res = Div;
1302 if (!IsDiv) {
1303 // Rem needs compensation; it's easier to recompute it
1304 Value *Rem = Builder.CreateMul(Div, Den);
1305 Res = Builder.CreateSub(Num, Rem);
1306 }
1307
1308 if (DivBits != 0 && DivBits < 32) {
1309 // Extend in register from the number of bits this divide really is.
1310 if (IsSigned) {
1311 int InRegBits = 32 - DivBits;
1312
1313 Res = Builder.CreateShl(Res, InRegBits);
1314 Res = Builder.CreateAShr(Res, InRegBits);
1315 } else {
1316 ConstantInt *TruncMask
1317 = Builder.getInt32((UINT64_C(1) << DivBits) - 1);
1318 Res = Builder.CreateAnd(Res, TruncMask);
1319 }
1320 }
1321
1322 return Res;
1323 }
1324
1325 // Try to recognize special cases for which the DAG will emit better
1326 // expansions than the general expansion we do here.
1327
1328 // TODO: It would be better to just directly handle those optimizations here.
1329 bool AMDGPUCodeGenPrepareImpl::divHasSpecialOptimization(BinaryOperator &I,
1330 Value *Num,
1331 Value *Den) const {
1332 if (Constant *C = dyn_cast<Constant>(Den)) {
1333 // Arbitrary constants get a better expansion as long as a wider mulhi is
1334 // legal.
1335 if (C->getType()->getScalarSizeInBits() <= 32)
1336 return true;
1337
1338 // TODO: Sdiv check for not exact for some reason.
1339
1340 // If there's no wider mulhi, there's only a better expansion for powers of
1341 // two.
1342 // TODO: Should really know for each vector element.
1343 if (isKnownToBeAPowerOfTwo(C, *DL, true, 0, AC, &I, DT))
1344 return true;
1345
1346 return false;
1347 }
1348
1349 if (BinaryOperator *BinOpDen = dyn_cast<BinaryOperator>(Den)) {
1350 // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2
1351 if (BinOpDen->getOpcode() == Instruction::Shl &&
1352 isa<Constant>(BinOpDen->getOperand(0)) &&
1353 isKnownToBeAPowerOfTwo(BinOpDen->getOperand(0), *DL, true,
1354 0, AC, &I, DT)) {
1355 return true;
1356 }
1357 }
1358
1359 return false;
1360 }
1361
1362 static Value *getSign32(Value *V, IRBuilder<> &Builder, const DataLayout *DL) {
1363 // Check whether the sign can be determined statically.
1364 KnownBits Known = computeKnownBits(V, *DL);
1365 if (Known.isNegative())
1366 return Constant::getAllOnesValue(V->getType());
1367 if (Known.isNonNegative())
1368 return Constant::getNullValue(V->getType());
1369 return Builder.CreateAShr(V, Builder.getInt32(31));
1370 }
1371
1372 Value *AMDGPUCodeGenPrepareImpl::expandDivRem32(IRBuilder<> &Builder,
1373 BinaryOperator &I, Value *X,
1374 Value *Y) const {
1375 Instruction::BinaryOps Opc = I.getOpcode();
1376 assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
1377 Opc == Instruction::SRem || Opc == Instruction::SDiv);
1378
1379 FastMathFlags FMF;
1380 FMF.setFast();
1381 Builder.setFastMathFlags(FMF);
1382
1383 if (divHasSpecialOptimization(I, X, Y))
1384 return nullptr; // Keep it for later optimization.
1385
1386 bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
1387 bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;
1388
1389 Type *Ty = X->getType();
1390 Type *I32Ty = Builder.getInt32Ty();
1391 Type *F32Ty = Builder.getFloatTy();
1392
1393 if (Ty->getScalarSizeInBits() != 32) {
1394 if (IsSigned) {
1395 X = Builder.CreateSExtOrTrunc(X, I32Ty);
1396 Y = Builder.CreateSExtOrTrunc(Y, I32Ty);
1397 } else {
1398 X = Builder.CreateZExtOrTrunc(X, I32Ty);
1399 Y = Builder.CreateZExtOrTrunc(Y, I32Ty);
1400 }
1401 }
1402
1403 if (Value *Res = expandDivRem24(Builder, I, X, Y, IsDiv, IsSigned)) {
1404 return IsSigned ? Builder.CreateSExtOrTrunc(Res, Ty) :
1405 Builder.CreateZExtOrTrunc(Res, Ty);
1406 }
1407
1408 ConstantInt *Zero = Builder.getInt32(0);
1409 ConstantInt *One = Builder.getInt32(1);
1410
1411 Value *Sign = nullptr;
1412 if (IsSigned) {
1413 Value *SignX = getSign32(X, Builder, DL);
1414 Value *SignY = getSign32(Y, Builder, DL);
1415 // Remainder sign is the same as LHS
1416 Sign = IsDiv ? Builder.CreateXor(SignX, SignY) : SignX;
1417
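// Take absolute values using the two's-complement identity
// |v| == (v + sign) ^ sign, where sign is 0 for non-negative v and -1 for
// negative v.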
1418 X = Builder.CreateAdd(X, SignX);
1419 Y = Builder.CreateAdd(Y, SignY);
1420
1421 X = Builder.CreateXor(X, SignX);
1422 Y = Builder.CreateXor(Y, SignY);
1423 }
1424
1425 // The algorithm here is based on ideas from "Software Integer Division", Tom
1426 // Rodeheffer, August 2008.
1427 //
1428 // unsigned udiv(unsigned x, unsigned y) {
1429 // // Initial estimate of inv(y). The constant is less than 2^32 to ensure
1430 // // that this is a lower bound on inv(y), even if some of the calculations
1431 // // round up.
1432 // unsigned z = (unsigned)((4294967296.0 - 512.0) * v_rcp_f32((float)y));
1433 //
1434 // // One round of UNR (Unsigned integer Newton-Raphson) to improve z.
1435 // // Empirically this is guaranteed to give a "two-y" lower bound on
1436 // // inv(y).
1437 // z += umulh(z, -y * z);
1438 //
1439 // // Quotient/remainder estimate.
1440 // unsigned q = umulh(x, z);
1441 // unsigned r = x - q * y;
1442 //
1443 // // Two rounds of quotient/remainder refinement.
1444 // if (r >= y) {
1445 // ++q;
1446 // r -= y;
1447 // }
1448 // if (r >= y) {
1449 // ++q;
1450 // r -= y;
1451 // }
1452 //
1453 // return q;
1454 // }
1455
1456 // Initial estimate of inv(y).
1457 Value *FloatY = Builder.CreateUIToFP(Y, F32Ty);
1458 Function *Rcp = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp, F32Ty);
1459 Value *RcpY = Builder.CreateCall(Rcp, {FloatY});
1460 Constant *Scale = ConstantFP::get(F32Ty, llvm::bit_cast<float>(0x4F7FFFFE));
1461 Value *ScaledY = Builder.CreateFMul(RcpY, Scale);
1462 Value *Z = Builder.CreateFPToUI(ScaledY, I32Ty);
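// bit_cast<float>(0x4F7FFFFE) is 4294966784.0f == 2^32 - 512, the
// (4294967296.0 - 512.0) constant from the pseudocode above.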
1463
1464 // One round of UNR.
1465 Value *NegY = Builder.CreateSub(Zero, Y);
1466 Value *NegYZ = Builder.CreateMul(NegY, Z);
1467 Z = Builder.CreateAdd(Z, getMulHu(Builder, Z, NegYZ));
1468
1469 // Quotient/remainder estimate.
1470 Value *Q = getMulHu(Builder, X, Z);
1471 Value *R = Builder.CreateSub(X, Builder.CreateMul(Q, Y));
1472
1473 // First quotient/remainder refinement.
1474 Value *Cond = Builder.CreateICmpUGE(R, Y);
1475 if (IsDiv)
1476 Q = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q);
1477 R = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R);
1478
1479 // Second quotient/remainder refinement.
1480 Cond = Builder.CreateICmpUGE(R, Y);
1481 Value *Res;
1482 if (IsDiv)
1483 Res = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q);
1484 else
1485 Res = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R);
1486
1487 if (IsSigned) {
1488 Res = Builder.CreateXor(Res, Sign);
1489 Res = Builder.CreateSub(Res, Sign);
1490 Res = Builder.CreateSExtOrTrunc(Res, Ty);
1491 } else {
1492 Res = Builder.CreateZExtOrTrunc(Res, Ty);
1493 }
1494 return Res;
1495 }
1496
1497 Value *AMDGPUCodeGenPrepareImpl::shrinkDivRem64(IRBuilder<> &Builder,
1498 BinaryOperator &I, Value *Num,
1499 Value *Den) const {
1500 if (!ExpandDiv64InIR && divHasSpecialOptimization(I, Num, Den))
1501 return nullptr; // Keep it for later optimization.
1502
1503 Instruction::BinaryOps Opc = I.getOpcode();
1504
1505 bool IsDiv = Opc == Instruction::SDiv || Opc == Instruction::UDiv;
1506 bool IsSigned = Opc == Instruction::SDiv || Opc == Instruction::SRem;
1507
1508 int NumDivBits = getDivNumBits(I, Num, Den, 32, IsSigned);
1509 if (NumDivBits == -1)
1510 return nullptr;
1511
1512 Value *Narrowed = nullptr;
1513 if (NumDivBits <= 24) {
1514 Narrowed = expandDivRem24Impl(Builder, I, Num, Den, NumDivBits,
1515 IsDiv, IsSigned);
1516 } else if (NumDivBits <= 32) {
1517 Narrowed = expandDivRem32(Builder, I, Num, Den);
1518 }
1519
1520 if (Narrowed) {
1521 return IsSigned ? Builder.CreateSExt(Narrowed, Num->getType()) :
1522 Builder.CreateZExt(Narrowed, Num->getType());
1523 }
1524
1525 return nullptr;
1526 }
1527
1528 void AMDGPUCodeGenPrepareImpl::expandDivRem64(BinaryOperator &I) const {
1529 Instruction::BinaryOps Opc = I.getOpcode();
1530 // Do the general expansion.
1531 if (Opc == Instruction::UDiv || Opc == Instruction::SDiv) {
1532 expandDivisionUpTo64Bits(&I);
1533 return;
1534 }
1535
1536 if (Opc == Instruction::URem || Opc == Instruction::SRem) {
1537 expandRemainderUpTo64Bits(&I);
1538 return;
1539 }
1540
1541 llvm_unreachable("not a division");
1542 }
1543
1544 bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) {
1545 if (foldBinOpIntoSelect(I))
1546 return true;
1547
1548 if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
1549 UA->isUniform(&I) && promoteUniformOpToI32(I))
1550 return true;
1551
1552 if (UseMul24Intrin && replaceMulWithMul24(I))
1553 return true;
1554
1555 bool Changed = false;
1556 Instruction::BinaryOps Opc = I.getOpcode();
1557 Type *Ty = I.getType();
1558 Value *NewDiv = nullptr;
1559 unsigned ScalarSize = Ty->getScalarSizeInBits();
1560
1561 SmallVector<BinaryOperator *, 8> Div64ToExpand;
1562
1563 if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
1564 Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
1565 ScalarSize <= 64 &&
1566 !DisableIDivExpand) {
1567 Value *Num = I.getOperand(0);
1568 Value *Den = I.getOperand(1);
1569 IRBuilder<> Builder(&I);
1570 Builder.SetCurrentDebugLocation(I.getDebugLoc());
1571
1572 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1573 NewDiv = PoisonValue::get(VT);
1574
1575 for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {
1576 Value *NumEltN = Builder.CreateExtractElement(Num, N);
1577 Value *DenEltN = Builder.CreateExtractElement(Den, N);
1578
1579 Value *NewElt;
1580 if (ScalarSize <= 32) {
1581 NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN);
1582 if (!NewElt)
1583 NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
1584 } else {
1585 // See if this 64-bit division can be shrunk to 32/24-bits before
1586 // producing the general expansion.
1587 NewElt = shrinkDivRem64(Builder, I, NumEltN, DenEltN);
1588 if (!NewElt) {
1589 // The general 64-bit expansion introduces control flow and doesn't
1590 // return the new value. Just insert a scalar copy and defer
1591 // expanding it.
1592 NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
1593 Div64ToExpand.push_back(cast<BinaryOperator>(NewElt));
1594 }
1595 }
1596
1597 if (auto *NewEltI = dyn_cast<Instruction>(NewElt))
1598 NewEltI->copyIRFlags(&I);
1599
1600 NewDiv = Builder.CreateInsertElement(NewDiv, NewElt, N);
1601 }
1602 } else {
1603 if (ScalarSize <= 32)
1604 NewDiv = expandDivRem32(Builder, I, Num, Den);
1605 else {
1606 NewDiv = shrinkDivRem64(Builder, I, Num, Den);
1607 if (!NewDiv)
1608 Div64ToExpand.push_back(&I);
1609 }
1610 }
1611
1612 if (NewDiv) {
1613 I.replaceAllUsesWith(NewDiv);
1614 I.eraseFromParent();
1615 Changed = true;
1616 }
1617 }
1618
1619 if (ExpandDiv64InIR) {
1620 // TODO: We get much worse code in specially handled constant cases.
1621 for (BinaryOperator *Div : Div64ToExpand) {
1622 expandDivRem64(*Div);
1623 FlowChanged = true;
1624 Changed = true;
1625 }
1626 }
1627
1628 return Changed;
1629 }
1630
1631 bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) {
1632 if (!WidenLoads)
1633 return false;
1634
1635 if ((I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
1636 I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
1637 canWidenScalarExtLoad(I)) {
1638 IRBuilder<> Builder(&I);
1639 Builder.SetCurrentDebugLocation(I.getDebugLoc());
1640
1641 Type *I32Ty = Builder.getInt32Ty();
1642 LoadInst *WidenLoad = Builder.CreateLoad(I32Ty, I.getPointerOperand());
1643 WidenLoad->copyMetadata(I);
1644
1645 // If we have range metadata, we need to convert the type, and not make
1646 // assumptions about the high bits.
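// (For instance, an i8 load annotated with !range [4, 8) becomes an i32 load
// with the wrapped range [4, 0), i.e. "at least 4", since nothing is known
// about the widened high bits.)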
1647 if (auto *Range = WidenLoad->getMetadata(LLVMContext::MD_range)) {
1648 ConstantInt *Lower =
1649 mdconst::extract<ConstantInt>(Range->getOperand(0));
1650
1651 if (Lower->isNullValue()) {
1652 WidenLoad->setMetadata(LLVMContext::MD_range, nullptr);
1653 } else {
1654 Metadata *LowAndHigh[] = {
1655 ConstantAsMetadata::get(ConstantInt::get(I32Ty, Lower->getValue().zext(32))),
1656 // Don't make assumptions about the high bits.
1657 ConstantAsMetadata::get(ConstantInt::get(I32Ty, 0))
1658 };
1659
1660 WidenLoad->setMetadata(LLVMContext::MD_range,
1661 MDNode::get(Mod->getContext(), LowAndHigh));
1662 }
1663 }
1664
1665 int TySize = Mod->getDataLayout().getTypeSizeInBits(I.getType());
1666 Type *IntNTy = Builder.getIntNTy(TySize);
1667 Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy);
1668 Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType());
1669 I.replaceAllUsesWith(ValOrig);
1670 I.eraseFromParent();
1671 return true;
1672 }
1673
1674 return false;
1675 }
1676
1677 bool AMDGPUCodeGenPrepareImpl::visitICmpInst(ICmpInst &I) {
1678 bool Changed = false;
1679
1680 if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
1681 UA->isUniform(&I))
1682 Changed |= promoteUniformOpToI32(I);
1683
1684 return Changed;
1685 }
1686
1687 bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) {
1688 Value *Cond = I.getCondition();
1689 Value *TrueVal = I.getTrueValue();
1690 Value *FalseVal = I.getFalseValue();
1691 Value *CmpVal;
1692 FCmpInst::Predicate Pred;
1693
1694 if (ST->has16BitInsts() && needsPromotionToI32(I.getType())) {
1695 if (UA->isUniform(&I))
1696 return promoteUniformOpToI32(I);
1697 return false;
1698 }
1699
1700 // Match fract pattern with nan check.
1701 if (!match(Cond, m_FCmp(Pred, m_Value(CmpVal), m_NonNaN())))
1702 return false;
1703
1704 FPMathOperator *FPOp = dyn_cast<FPMathOperator>(&I);
1705 if (!FPOp)
1706 return false;
1707
1708 IRBuilder<> Builder(&I);
1709 Builder.setFastMathFlags(FPOp->getFastMathFlags());
1710
1711 auto *IITrue = dyn_cast<IntrinsicInst>(TrueVal);
1712 auto *IIFalse = dyn_cast<IntrinsicInst>(FalseVal);
1713
1714 Value *Fract = nullptr;
1715 if (Pred == FCmpInst::FCMP_UNO && TrueVal == CmpVal && IIFalse &&
1716 CmpVal == matchFractPat(*IIFalse)) {
1717 // isnan(x) ? x : fract(x)
1718 Fract = applyFractPat(Builder, CmpVal);
1719 } else if (Pred == FCmpInst::FCMP_ORD && FalseVal == CmpVal && IITrue &&
1720 CmpVal == matchFractPat(*IITrue)) {
1721 // !isnan(x) ? fract(x) : x
1722 Fract = applyFractPat(Builder, CmpVal);
1723 } else
1724 return false;
1725
1726 Fract->takeName(&I);
1727 I.replaceAllUsesWith(Fract);
1728 RecursivelyDeleteTriviallyDeadInstructions(&I, TLInfo);
1729 return true;
1730 }
1731
1732 static bool areInSameBB(const Value *A, const Value *B) {
1733 const auto *IA = dyn_cast<Instruction>(A);
1734 const auto *IB = dyn_cast<Instruction>(B);
1735 return IA && IB && IA->getParent() == IB->getParent();
1736 }
1737
1738 // Helper for breaking large PHIs that returns true when an extractelement on V
1739 // is likely to be folded away by the DAG combiner.
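// For instance, if V is built up in the same block as
//   %v0 = insertelement <2 x i16> poison, i16 %a, i32 0
//   %v  = insertelement <2 x i16> %v0, i16 %b, i32 1
// then an extractelement of either lane can simply be folded to %a or %b.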
1740 static bool isInterestingPHIIncomingValue(const Value *V) {
1741 const auto *FVT = dyn_cast<FixedVectorType>(V->getType());
1742 if (!FVT)
1743 return false;
1744
1745 const Value *CurVal = V;
1746
1747 // Check for insertelements, keeping track of the elements covered.
1748 BitVector EltsCovered(FVT->getNumElements());
1749 while (const auto *IE = dyn_cast<InsertElementInst>(CurVal)) {
1750 const auto *Idx = dyn_cast<ConstantInt>(IE->getOperand(2));
1751
1752 // Non-constant index/out-of-bounds index -> folding is unlikely.
1753 // The latter is more of a sanity check because canonical IR should just
1754 // have replaced those with poison.
1755 if (!Idx || Idx->getZExtValue() >= FVT->getNumElements())
1756 return false;
1757
1758 const auto *VecSrc = IE->getOperand(0);
1759
1760 // If the vector source is another instruction, it must be in the same basic
1761 // block. Otherwise, the DAGCombiner won't see the whole thing and is
1762 // unlikely to be able to do anything interesting here.
1763 if (isa<Instruction>(VecSrc) && !areInSameBB(VecSrc, IE))
1764 return false;
1765
1766 CurVal = VecSrc;
1767 EltsCovered.set(Idx->getZExtValue());
1768
1769 // All elements covered.
1770 if (EltsCovered.all())
1771 return true;
1772 }
1773
1774 // We either didn't find a single insertelement, or the insertelement chain
1775 // ended before all elements were covered. Check for other interesting values.
1776
1777 // Constants are always interesting because we can just constant fold the
1778 // extractelements.
1779 if (isa<Constant>(CurVal))
1780 return true;
1781
1782 // shufflevector is likely to be profitable if either operand is a constant,
1783 // or if either source is in the same block.
1784 // This is because shufflevector is most often lowered as a series of
1785 // insert/extract elements anyway.
1786 if (const auto *SV = dyn_cast<ShuffleVectorInst>(CurVal)) {
1787 return isa<Constant>(SV->getOperand(1)) ||
1788 areInSameBB(SV, SV->getOperand(0)) ||
1789 areInSameBB(SV, SV->getOperand(1));
1790 }
1791
1792 return false;
1793 }
1794
1795 static void collectPHINodes(const PHINode &I,
1796 SmallPtrSet<const PHINode *, 8> &SeenPHIs) {
1797 const auto [It, Inserted] = SeenPHIs.insert(&I);
1798 if (!Inserted)
1799 return;
1800
1801 for (const Value *Inc : I.incoming_values()) {
1802 if (const auto *PhiInc = dyn_cast<PHINode>(Inc))
1803 collectPHINodes(*PhiInc, SeenPHIs);
1804 }
1805
1806 for (const User *U : I.users()) {
1807 if (const auto *PhiU = dyn_cast<PHINode>(U))
1808 collectPHINodes(*PhiU, SeenPHIs);
1809 }
1810 }
1811
1812 bool AMDGPUCodeGenPrepareImpl::canBreakPHINode(const PHINode &I) {
1813 // Check in the cache first.
1814 if (const auto It = BreakPhiNodesCache.find(&I);
1815 It != BreakPhiNodesCache.end())
1816 return It->second;
1817
1818 // We consider PHI nodes as part of "chains", so given a PHI node I, we
1819 // recursively consider all its users and incoming values that are also PHI
1820 // nodes. We then make a decision about all of those PHIs at once. Either they
1821 // all get broken up, or none of them do. That way, we avoid cases where a
1822 // single PHI is/is not broken and we end up reforming/exploding a vector
1823 // multiple times, or even worse, doing it in a loop.
1824 SmallPtrSet<const PHINode *, 8> WorkList;
1825 collectPHINodes(I, WorkList);
1826
1827 #ifndef NDEBUG
1828 // Check that none of the PHI nodes in the worklist are in the map. If some of
1829 // them are, it means we're not good enough at collecting related PHIs.
1830 for (const PHINode *WLP : WorkList) {
1831 assert(BreakPhiNodesCache.count(WLP) == 0);
1832 }
1833 #endif
1834
1835 // To consider a PHI profitable to break, we need to see some interesting
1836 // incoming values. At least 2/3rd (rounded up) of all PHIs in the worklist
1837 // must have one to consider all PHIs breakable.
1838 //
1839 // This threshold has been determined through performance testing.
1840 //
1841 // Note that the computation below is equivalent to
1842 //
1843 // (unsigned)ceil((K / 3.0) * 2), where K is WorkList.size()
1844 //
1845 // It's simply written this way to avoid mixing integral/FP arithmetic.
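// For example, with K == 4 PHIs in the worklist, Threshold ==
// alignTo(8, 3) / 3 == 9 / 3 == 3, which matches ceil((4 / 3.0) * 2).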
1846 const auto Threshold = (alignTo(WorkList.size() * 2, 3) / 3);
1847 unsigned NumBreakablePHIs = 0;
1848 bool CanBreak = false;
1849 for (const PHINode *Cur : WorkList) {
1850 // Don't break PHIs that have no interesting incoming values. That is, where
1851 // there is no clear opportunity to fold the "extractelement" instructions
1852 // we would add.
1853 //
1854 // Note: InstCombine does not run after this pass, so we're only
1855 // interested in the foldings that the DAG combiner can do.
1856 if (any_of(Cur->incoming_values(), isInterestingPHIIncomingValue)) {
1857 if (++NumBreakablePHIs >= Threshold) {
1858 CanBreak = true;
1859 break;
1860 }
1861 }
1862 }
1863
1864 for (const PHINode *Cur : WorkList)
1865 BreakPhiNodesCache[Cur] = CanBreak;
1866
1867 return CanBreak;
1868 }
1869
1870 /// Helper class for "break large PHIs" (visitPHINode).
1871 ///
1872 /// This represents a slice of a PHI's incoming value, which is made up of:
1873 /// - The type of the slice (Ty)
1874 /// - The index in the incoming value's vector where the slice starts (Idx)
1875 /// - The number of elements in the slice (NumElts).
1876 /// It also keeps track of the NewPHI node inserted for this particular slice.
1877 ///
1878 /// Slice examples:
1879 /// <4 x i64> -> Split into four i64 slices.
1880 /// -> [i64, 0, 1], [i64, 1, 1], [i64, 2, 1], [i64, 3, 1]
1881 /// <5 x i16> -> Split into 2 <2 x i16> slices + an i16 tail.
1882 /// -> [<2 x i16>, 0, 2], [<2 x i16>, 2, 2], [i16, 4, 1]
1883 class VectorSlice {
1884 public:
1885 VectorSlice(Type *Ty, unsigned Idx, unsigned NumElts)
1886 : Ty(Ty), Idx(Idx), NumElts(NumElts) {}
1887
1888 Type *Ty = nullptr;
1889 unsigned Idx = 0;
1890 unsigned NumElts = 0;
1891 PHINode *NewPHI = nullptr;
1892
1893 /// Slice \p Inc according to the information contained within this slice.
1894 /// This is cached, so if called multiple times for the same \p BB & \p Inc
1895 /// pair, it returns the same Sliced value as well.
1896 ///
1897 /// Note this *intentionally* does not return the same value for, say,
1898 /// [%bb.0, %0] & [%bb.1, %0] as:
1899 /// - It could cause issues with dominance (e.g. if bb.1 is seen first, the
1900 /// extract created in bb.1 may not be usable from bb.0 when bb.0 is
1901 /// bb.1's predecessor.)
1902 /// - We also want to make our extract instructions as local as possible so
1903 /// the DAG has better chances of folding them out. Duplicating them like
1904 /// that is beneficial in that regard.
1905 ///
1906 /// This is both a minor optimization to avoid creating duplicate
1907 /// instructions and a requirement for correctness. It is not forbidden
1908 /// for a PHI node to have the same [BB, Val] pair multiple times. If we
1909 /// returned a new value each time, those previously identical pairs would all
1910 /// have different incoming values (from the same block) and it'd cause a "PHI
1911 /// node has multiple entries for the same basic block with different incoming
1912 /// values!" verifier error.
1913 Value *getSlicedVal(BasicBlock *BB, Value *Inc, StringRef NewValName) {
1914 Value *&Res = SlicedVals[{BB, Inc}];
1915 if (Res)
1916 return Res;
1917
1918 IRBuilder<> B(BB->getTerminator());
1919 if (Instruction *IncInst = dyn_cast<Instruction>(Inc))
1920 B.SetCurrentDebugLocation(IncInst->getDebugLoc());
1921
1922 if (NumElts > 1) {
1923 SmallVector<int, 4> Mask;
1924 for (unsigned K = Idx; K < (Idx + NumElts); ++K)
1925 Mask.push_back(K);
1926 Res = B.CreateShuffleVector(Inc, Mask, NewValName);
1927 } else
1928 Res = B.CreateExtractElement(Inc, Idx, NewValName);
1929
1930 return Res;
1931 }
1932
1933 private:
1934 SmallDenseMap<std::pair<BasicBlock *, Value *>, Value *> SlicedVals;
1935 };
1936
1937 bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) {
1938 // Break-up fixed-vector PHIs into smaller pieces.
1939 // Default threshold is 32, so it breaks up any vector that's >32 bits into
1940 // its elements, or into 32-bit pieces (for 8/16 bit elts).
1941 //
1942 // This is only helpful for DAGISel because it doesn't handle large PHIs as
1943 // well as GlobalISel. DAGISel lowers PHIs by using CopyToReg/CopyFromReg.
1944 // With large, odd-sized PHIs we may end up needing many `build_vector`
1945 // operations with most elements being "undef". This inhibits a lot of
1946 // optimization opportunities and can result in unreasonably high register
1947 // pressure and the inevitable stack spilling.
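// Illustrative sketch (not taken verbatim from a test): with the default
// 32-bit threshold, a PHI such as
//   %v = phi <4 x float> [ %a, %bb0 ], [ %b, %bb1 ]
// is rewritten into four scalar PHIs, one per lane, fed by extractelements
// emitted in each predecessor block, and %v's users are handed back a vector
// rebuilt with insertelements.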
1948 if (!BreakLargePHIs || getCGPassBuilderOption().EnableGlobalISelOption)
1949 return false;
1950
1951 FixedVectorType *FVT = dyn_cast<FixedVectorType>(I.getType());
1952 if (!FVT || FVT->getNumElements() == 1 ||
1953 DL->getTypeSizeInBits(FVT) <= BreakLargePHIsThreshold)
1954 return false;
1955
1956 if (!ForceBreakLargePHIs && !canBreakPHINode(I))
1957 return false;
1958
1959 std::vector<VectorSlice> Slices;
1960
1961 Type *EltTy = FVT->getElementType();
1962 {
1963 unsigned Idx = 0;
1964 // For 8/16-bit element types, don't scalarize fully; break the vector up
1965 // into as many 32-bit slices as we can, and scalarize the tail.
1966 const unsigned EltSize = DL->getTypeSizeInBits(EltTy);
1967 const unsigned NumElts = FVT->getNumElements();
1968 if (EltSize == 8 || EltSize == 16) {
1969 const unsigned SubVecSize = (32 / EltSize);
1970 Type *SubVecTy = FixedVectorType::get(EltTy, SubVecSize);
1971 for (unsigned End = alignDown(NumElts, SubVecSize); Idx < End;
1972 Idx += SubVecSize)
1973 Slices.emplace_back(SubVecTy, Idx, SubVecSize);
1974 }
1975
1976 // Scalarize all remaining elements.
1977 for (; Idx < NumElts; ++Idx)
1978 Slices.emplace_back(EltTy, Idx, 1);
1979 }
1980
1981 assert(Slices.size() > 1);
1982
1983 // Create one PHI per vector piece. The "VectorSlice" class takes care of
1984 // creating the necessary instruction to extract the relevant slices of each
1985 // incoming value.
1986 IRBuilder<> B(I.getParent());
1987 B.SetCurrentDebugLocation(I.getDebugLoc());
1988
1989 unsigned IncNameSuffix = 0;
1990 for (VectorSlice &S : Slices) {
1991 // We need to reset the builder's insertion point on each iteration,
1992 // because getSlicedVal may have inserted something into I's BB.
1993 B.SetInsertPoint(I.getParent()->getFirstNonPHIIt());
1994 S.NewPHI = B.CreatePHI(S.Ty, I.getNumIncomingValues());
1995
1996 for (const auto &[Idx, BB] : enumerate(I.blocks())) {
1997 S.NewPHI->addIncoming(S.getSlicedVal(BB, I.getIncomingValue(Idx),
1998 "largephi.extractslice" +
1999 std::to_string(IncNameSuffix++)),
2000 BB);
2001 }
2002 }
2003
2004 // And replace this PHI with a vector of all the previous PHI values.
2005 Value *Vec = PoisonValue::get(FVT);
2006 unsigned NameSuffix = 0;
2007 for (VectorSlice &S : Slices) {
2008 const auto ValName = "largephi.insertslice" + std::to_string(NameSuffix++);
2009 if (S.NumElts > 1)
2010 Vec =
2011 B.CreateInsertVector(FVT, Vec, S.NewPHI, B.getInt64(S.Idx), ValName);
2012 else
2013 Vec = B.CreateInsertElement(Vec, S.NewPHI, S.Idx, ValName);
2014 }
2015
2016 I.replaceAllUsesWith(Vec);
2017 I.eraseFromParent();
2018 return true;
2019 }
2020
2021 /// \param V Value to check
2022 /// \param DL DataLayout
2023 /// \param TM TargetMachine (TODO: remove once DL contains nullptr values)
2024 /// \param AS Target Address Space
2025 /// \return true if \p V cannot be the null value of \p AS, false otherwise.
2026 static bool isPtrKnownNeverNull(const Value *V, const DataLayout &DL,
2027 const AMDGPUTargetMachine &TM, unsigned AS) {
2028 // Pointer cannot be null if it's a block address, GV or alloca.
2029 // NOTE: We don't support extern_weak, but if we did, we'd need to check for
2030 // it as the symbol could be null in such cases.
2031 if (isa<BlockAddress>(V) || isa<GlobalValue>(V) || isa<AllocaInst>(V))
2032 return true;
2033
2034 // Check nonnull arguments.
2035 if (const auto *Arg = dyn_cast<Argument>(V); Arg && Arg->hasNonNullAttr())
2036 return true;
2037
2038 // getUnderlyingObject may have looked through another addrspacecast,
2039 // although the optimizable situations have most likely been folded out by now.
2040 if (AS != cast<PointerType>(V->getType())->getAddressSpace())
2041 return false;
2042
2043 // TODO: Calls that return nonnull?
2044
2045 // For all other things, use KnownBits.
2046 // We either use 0 or all bits set to indicate null, so check whether the
2047 // value can be zero or all ones.
2048 //
2049 // TODO: Use ValueTracking's isKnownNeverNull if it becomes aware that some
2050 // address spaces have non-zero null values.
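// (On AMDGPU, for example, the private and local address spaces use an
// all-ones null value, while the flat and global address spaces use zero.)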
2051 auto SrcPtrKB = computeKnownBits(V, DL);
2052 const auto NullVal = TM.getNullPointerValue(AS);
2053
2054 assert(SrcPtrKB.getBitWidth() == DL.getPointerSizeInBits(AS));
2055 assert((NullVal == 0 || NullVal == -1) &&
2056 "don't know how to check for this null value!");
2057 return NullVal ? !SrcPtrKB.getMaxValue().isAllOnes() : SrcPtrKB.isNonZero();
2058 }
2059
2060 bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
2061 // The intrinsic doesn't support vectors. It also seems difficult to prove
2062 // that a vector cannot contain any null elements, so it's unclear whether
2063 // supporting vectors would be worth it.
2064 if (I.getType()->isVectorTy())
2065 return false;
2066
2067 // Check if this can be lowered to an amdgcn.addrspacecast.nonnull.
2068 // This is only worthwhile for casts between flat and private/local.
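// Illustrative example: a cast of an alloca (private, addrspace(5)) to flat,
//   %flat = addrspacecast ptr addrspace(5) %alloca to ptr
// can become llvm.amdgcn.addrspacecast.nonnull, since an alloca is known to
// be non-null.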
2069 const unsigned SrcAS = I.getSrcAddressSpace();
2070 const unsigned DstAS = I.getDestAddressSpace();
2071
2072 bool CanLower = false;
2073 if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
2074 CanLower = (DstAS == AMDGPUAS::LOCAL_ADDRESS ||
2075 DstAS == AMDGPUAS::PRIVATE_ADDRESS);
2076 else if (DstAS == AMDGPUAS::FLAT_ADDRESS)
2077 CanLower = (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2078 SrcAS == AMDGPUAS::PRIVATE_ADDRESS);
2079 if (!CanLower)
2080 return false;
2081
2082 SmallVector<const Value *, 4> WorkList;
2083 getUnderlyingObjects(I.getOperand(0), WorkList);
2084 if (!all_of(WorkList, [&](const Value *V) {
2085 return isPtrKnownNeverNull(V, *DL, *TM, SrcAS);
2086 }))
2087 return false;
2088
2089 IRBuilder<> B(&I);
2090 auto *Intrin = B.CreateIntrinsic(
2091 I.getType(), Intrinsic::amdgcn_addrspacecast_nonnull, {I.getOperand(0)});
2092 I.replaceAllUsesWith(Intrin);
2093 I.eraseFromParent();
2094 return true;
2095 }
2096
2097 bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
2098 switch (I.getIntrinsicID()) {
2099 case Intrinsic::bitreverse:
2100 return visitBitreverseIntrinsicInst(I);
2101 case Intrinsic::minnum:
2102 return visitMinNum(I);
2103 case Intrinsic::sqrt:
2104 return visitSqrt(I);
2105 default:
2106 return false;
2107 }
2108 }
2109
2110 bool AMDGPUCodeGenPrepareImpl::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
2111 bool Changed = false;
2112
2113 if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
2114 UA->isUniform(&I))
2115 Changed |= promoteUniformBitreverseToI32(I);
2116
2117 return Changed;
2118 }
2119
2120 /// Match non-nan fract pattern.
2121 /// minnum(fsub(x, floor(x)), nextafter(1.0, -1.0)
2122 ///
2123 /// If fract is a useful instruction for the subtarget. Does not account for the
2124 /// nan handling; the instruction has a nan check on the input value.
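/// For reference, nextafter(1.0, -1.0) is the largest value below 1.0 in the
/// source type: 0x3BFF for f16, 0x3F7FFFFF for f32, and 0x3FEFFFFFFFFFFFFF
/// for f64.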
2125 Value *AMDGPUCodeGenPrepareImpl::matchFractPat(IntrinsicInst &I) {
2126 if (ST->hasFractBug())
2127 return nullptr;
2128
2129 if (I.getIntrinsicID() != Intrinsic::minnum)
2130 return nullptr;
2131
2132 Type *Ty = I.getType();
2133 if (!isLegalFloatingTy(Ty->getScalarType()))
2134 return nullptr;
2135
2136 Value *Arg0 = I.getArgOperand(0);
2137 Value *Arg1 = I.getArgOperand(1);
2138
2139 const APFloat *C;
2140 if (!match(Arg1, m_APFloat(C)))
2141 return nullptr;
2142
2143 APFloat One(1.0);
2144 bool LosesInfo;
2145 One.convert(C->getSemantics(), APFloat::rmNearestTiesToEven, &LosesInfo);
2146
2147 // Match nextafter(1.0, -1.0).
2148 One.next(true);
2149 if (One != *C)
2150 return nullptr;
2151
2152 Value *FloorSrc;
2153 if (match(Arg0, m_FSub(m_Value(FloorSrc),
2154 m_Intrinsic<Intrinsic::floor>(m_Deferred(FloorSrc)))))
2155 return FloorSrc;
2156 return nullptr;
2157 }
2158
2159 Value *AMDGPUCodeGenPrepareImpl::applyFractPat(IRBuilder<> &Builder,
2160 Value *FractArg) {
2161 SmallVector<Value *, 4> FractVals;
2162 extractValues(Builder, FractVals, FractArg);
2163
2164 SmallVector<Value *, 4> ResultVals(FractVals.size());
2165
2166 Type *Ty = FractArg->getType()->getScalarType();
2167 for (unsigned I = 0, E = FractVals.size(); I != E; ++I) {
2168 ResultVals[I] =
2169 Builder.CreateIntrinsic(Intrinsic::amdgcn_fract, {Ty}, {FractVals[I]});
2170 }
2171
2172 return insertValues(Builder, FractArg->getType(), ResultVals);
2173 }
2174
2175 bool AMDGPUCodeGenPrepareImpl::visitMinNum(IntrinsicInst &I) {
2176 Value *FractArg = matchFractPat(I);
2177 if (!FractArg)
2178 return false;
2179
2180 // Match pattern for fract intrinsic in contexts where the nan check has been
2181 // optimized out (and hope the knowledge the source can't be nan wasn't lost).
2182 if (!I.hasNoNaNs() &&
2183 !isKnownNeverNaN(FractArg, /*Depth=*/0, SimplifyQuery(*DL, TLInfo)))
2184 return false;
2185
2186 IRBuilder<> Builder(&I);
2187 FastMathFlags FMF = I.getFastMathFlags();
2188 FMF.setNoNaNs();
2189 Builder.setFastMathFlags(FMF);
2190
2191 Value *Fract = applyFractPat(Builder, FractArg);
2192 Fract->takeName(&I);
2193 I.replaceAllUsesWith(Fract);
2194
2195 RecursivelyDeleteTriviallyDeadInstructions(&I, TLInfo);
2196 return true;
2197 }
2198
2199 static bool isOneOrNegOne(const Value *Val) {
2200 const APFloat *C;
2201 return match(Val, m_APFloat(C)) && C->getExactLog2Abs() == 0;
2202 }
2203
2204 // Expand llvm.sqrt.f32 calls with !fpmath metadata in a semi-fast way.
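// For example (an illustrative IR snippet, not taken from a specific test),
// a call like
//   %r = call float @llvm.sqrt.f32(float %x), !fpmath !0
//   !0 = !{float 2.000000e+00}
// requests 2 ulp of accuracy, which is loose enough to be expanded here.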
2205 bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
2206 Type *Ty = Sqrt.getType()->getScalarType();
2207 if (!Ty->isFloatTy() && (!Ty->isHalfTy() || ST->has16BitInsts()))
2208 return false;
2209
2210 const FPMathOperator *FPOp = cast<const FPMathOperator>(&Sqrt);
2211 FastMathFlags SqrtFMF = FPOp->getFastMathFlags();
2212
2213 // We're trying to handle the fast-but-not-that-fast case only. The lowering
2214 // of fast llvm.sqrt will give the raw instruction anyway.
2215 if (SqrtFMF.approxFunc() || HasUnsafeFPMath)
2216 return false;
2217
2218 const float ReqdAccuracy = FPOp->getFPAccuracy();
2219
2220 // Defer correctly rounded expansion to codegen.
2221 if (ReqdAccuracy < 1.0f)
2222 return false;
2223
2224 // FIXME: This is an ugly hack for this pass using forward iteration instead
2225 // of reverse. If it worked like a normal combiner, the rsq would form before
2226 // we saw a sqrt call.
2227 auto *FDiv =
2228 dyn_cast_or_null<FPMathOperator>(Sqrt.getUniqueUndroppableUser());
2229 if (FDiv && FDiv->getOpcode() == Instruction::FDiv &&
2230 FDiv->getFPAccuracy() >= 1.0f &&
2231 canOptimizeWithRsq(FPOp, FDiv->getFastMathFlags(), SqrtFMF) &&
2232 // TODO: We should also handle the arcp case for the fdiv with non-1 value
2233 isOneOrNegOne(FDiv->getOperand(0)))
2234 return false;
2235
2236 Value *SrcVal = Sqrt.getOperand(0);
2237 bool CanTreatAsDAZ = canIgnoreDenormalInput(SrcVal, &Sqrt);
2238
2239 // The raw instruction is 1 ulp, but the correction for denormal handling
2240 // brings it to 2.
2241 if (!CanTreatAsDAZ && ReqdAccuracy < 2.0f)
2242 return false;
2243
2244 IRBuilder<> Builder(&Sqrt);
2245 SmallVector<Value *, 4> SrcVals;
2246 extractValues(Builder, SrcVals, SrcVal);
2247
2248 SmallVector<Value *, 4> ResultVals(SrcVals.size());
2249 for (int I = 0, E = SrcVals.size(); I != E; ++I) {
2250 if (CanTreatAsDAZ)
2251 ResultVals[I] = Builder.CreateCall(getSqrtF32(), SrcVals[I]);
2252 else
2253 ResultVals[I] = emitSqrtIEEE2ULP(Builder, SrcVals[I], SqrtFMF);
2254 }
2255
2256 Value *NewSqrt = insertValues(Builder, Sqrt.getType(), ResultVals);
2257 NewSqrt->takeName(&Sqrt);
2258 Sqrt.replaceAllUsesWith(NewSqrt);
2259 Sqrt.eraseFromParent();
2260 return true;
2261 }
2262
2263 bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
2264 Impl.Mod = &M;
2265 Impl.DL = &Impl.Mod->getDataLayout();
2266 Impl.SqrtF32 = nullptr;
2267 Impl.LdexpF32 = nullptr;
2268 return false;
2269 }
2270
2271 bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
2272 if (skipFunction(F))
2273 return false;
2274
2275 auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
2276 if (!TPC)
2277 return false;
2278
2279 const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
2280 Impl.TM = &TM;
2281 Impl.TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
2282 Impl.ST = &TM.getSubtarget<GCNSubtarget>(F);
2283 Impl.AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2284 Impl.UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
2285 auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
2286 Impl.DT = DTWP ? &DTWP->getDomTree() : nullptr;
2287 Impl.HasUnsafeFPMath = hasUnsafeFPMath(F);
2288 SIModeRegisterDefaults Mode(F, *Impl.ST);
2289 Impl.HasFP32DenormalFlush =
2290 Mode.FP32Denormals == DenormalMode::getPreserveSign();
2291 return Impl.run(F);
2292 }
2293
2294 PreservedAnalyses AMDGPUCodeGenPreparePass::run(Function &F,
2295 FunctionAnalysisManager &FAM) {
2296 AMDGPUCodeGenPrepareImpl Impl;
2297 Impl.Mod = F.getParent();
2298 Impl.DL = &Impl.Mod->getDataLayout();
2299 Impl.TM = static_cast<const AMDGPUTargetMachine *>(&TM);
2300 Impl.TLInfo = &FAM.getResult<TargetLibraryAnalysis>(F);
2301 Impl.ST = &TM.getSubtarget<GCNSubtarget>(F);
2302 Impl.AC = &FAM.getResult<AssumptionAnalysis>(F);
2303 Impl.UA = &FAM.getResult<UniformityInfoAnalysis>(F);
2304 Impl.DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
2305 Impl.HasUnsafeFPMath = hasUnsafeFPMath(F);
2306 SIModeRegisterDefaults Mode(F, *Impl.ST);
2307 Impl.HasFP32DenormalFlush =
2308 Mode.FP32Denormals == DenormalMode::getPreserveSign();
2309 PreservedAnalyses PA = PreservedAnalyses::none();
2310 if (!Impl.FlowChanged)
2311 PA.preserveSet<CFGAnalyses>();
2312 return Impl.run(F) ? PA : PreservedAnalyses::all();
2313 }
2314
2315 INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
2316 "AMDGPU IR optimizations", false, false)
2317 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
2318 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
2319 INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
2320 INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
2321 false, false)
2322
2323 char AMDGPUCodeGenPrepare::ID = 0;
2324
2325 FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
2326 return new AMDGPUCodeGenPrepare();
2327 }
2328