1 //===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// This pass does misc. AMDGPU optimizations on IR before instruction 11 /// selection. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "AMDGPU.h" 16 #include "AMDGPUTargetMachine.h" 17 #include "SIModeRegisterDefaults.h" 18 #include "llvm/Analysis/AssumptionCache.h" 19 #include "llvm/Analysis/ConstantFolding.h" 20 #include "llvm/Analysis/TargetLibraryInfo.h" 21 #include "llvm/Analysis/UniformityAnalysis.h" 22 #include "llvm/Analysis/ValueTracking.h" 23 #include "llvm/CodeGen/TargetPassConfig.h" 24 #include "llvm/IR/Dominators.h" 25 #include "llvm/IR/IRBuilder.h" 26 #include "llvm/IR/InstVisitor.h" 27 #include "llvm/IR/IntrinsicsAMDGPU.h" 28 #include "llvm/IR/PatternMatch.h" 29 #include "llvm/InitializePasses.h" 30 #include "llvm/Pass.h" 31 #include "llvm/Support/KnownBits.h" 32 #include "llvm/Transforms/Utils/IntegerDivision.h" 33 #include "llvm/Transforms/Utils/Local.h" 34 35 #define DEBUG_TYPE "amdgpu-codegenprepare" 36 37 using namespace llvm; 38 using namespace llvm::PatternMatch; 39 40 namespace { 41 42 static cl::opt<bool> WidenLoads( 43 "amdgpu-codegenprepare-widen-constant-loads", 44 cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"), 45 cl::ReallyHidden, 46 cl::init(false)); 47 48 static cl::opt<bool> Widen16BitOps( 49 "amdgpu-codegenprepare-widen-16-bit-ops", 50 cl::desc("Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"), 51 cl::ReallyHidden, 52 cl::init(true)); 53 54 static cl::opt<bool> 55 ScalarizeLargePHIs("amdgpu-codegenprepare-break-large-phis", 56 cl::desc("Break large PHI nodes for DAGISel"), 57 cl::ReallyHidden, cl::init(true)); 58 59 static cl::opt<bool> 60 ForceScalarizeLargePHIs("amdgpu-codegenprepare-force-break-large-phis", 61 cl::desc("For testing purposes, always break large " 62 "PHIs even if it isn't profitable."), 63 cl::ReallyHidden, cl::init(false)); 64 65 static cl::opt<unsigned> ScalarizeLargePHIsThreshold( 66 "amdgpu-codegenprepare-break-large-phis-threshold", 67 cl::desc("Minimum type size in bits for breaking large PHI nodes"), 68 cl::ReallyHidden, cl::init(32)); 69 70 static cl::opt<bool> UseMul24Intrin( 71 "amdgpu-codegenprepare-mul24", 72 cl::desc("Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"), 73 cl::ReallyHidden, 74 cl::init(true)); 75 76 // Legalize 64-bit division by using the generic IR expansion. 77 static cl::opt<bool> ExpandDiv64InIR( 78 "amdgpu-codegenprepare-expand-div64", 79 cl::desc("Expand 64-bit division in AMDGPUCodeGenPrepare"), 80 cl::ReallyHidden, 81 cl::init(false)); 82 83 // Leave all division operations as they are. This supersedes ExpandDiv64InIR 84 // and is used for testing the legalizer. 85 static cl::opt<bool> DisableIDivExpand( 86 "amdgpu-codegenprepare-disable-idiv-expansion", 87 cl::desc("Prevent expanding integer division in AMDGPUCodeGenPrepare"), 88 cl::ReallyHidden, 89 cl::init(false)); 90 91 // Disable processing of fdiv so we can better test the backend implementations. 
static cl::opt<bool> DisableFDivExpand(
  "amdgpu-codegenprepare-disable-fdiv-expansion",
  cl::desc("Prevent expanding floating point division in AMDGPUCodeGenPrepare"),
  cl::ReallyHidden,
  cl::init(false));

class AMDGPUCodeGenPrepareImpl
    : public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
public:
  const GCNSubtarget *ST = nullptr;
  const TargetLibraryInfo *TLInfo = nullptr;
  AssumptionCache *AC = nullptr;
  DominatorTree *DT = nullptr;
  UniformityInfo *UA = nullptr;
  Module *Mod = nullptr;
  const DataLayout *DL = nullptr;
  bool HasUnsafeFPMath = false;
  bool HasFP32DenormalFlush = false;
  bool FlowChanged = false;

  DenseMap<const PHINode *, bool> BreakPhiNodesCache;

  bool canBreakPHINode(const PHINode &I);

  /// \returns \p T's base element bit width.
  unsigned getBaseElementBitWidth(const Type *T) const;

  /// \returns Equivalent 32 bit integer type for given type \p T. For example,
  /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
  /// is returned.
  Type *getI32Ty(IRBuilder<> &B, const Type *T) const;

  /// \returns True if binary operation \p I is a signed binary operation, false
  /// otherwise.
  bool isSigned(const BinaryOperator &I) const;

  /// \returns True if the condition of 'select' operation \p I comes from a
  /// signed 'icmp' operation, false otherwise.
  bool isSigned(const SelectInst &I) const;

  /// \returns True if type \p T needs to be promoted to 32 bit integer type,
  /// false otherwise.
  bool needsPromotionToI32(const Type *T) const;

  /// Return true if \p T is a legal scalar floating point type.
  bool isLegalFloatingTy(const Type *T) const;

  /// Wrapper to pass all the arguments to computeKnownFPClass.
  KnownFPClass computeKnownFPClass(const Value *V, FPClassTest Interested,
                                   const Instruction *CtxI) const {
    return llvm::computeKnownFPClass(V, *DL, Interested, 0, TLInfo, AC, CtxI,
                                     DT);
  }

  bool canIgnoreDenormalInput(const Value *V, const Instruction *CtxI) const {
    return HasFP32DenormalFlush ||
           computeKnownFPClass(V, fcSubnormal, CtxI).isKnownNeverSubnormal();
  }

  /// Promotes uniform binary operation \p I to equivalent 32 bit binary
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, replacing \p I with the equivalent 32 bit binary operation,
  /// and truncating the result of the 32 bit binary operation back to \p I's
  /// original type. Division operations are not promoted.
  ///
  /// \returns True if \p I is promoted to the equivalent 32 bit binary
  /// operation, false otherwise.
  bool promoteUniformOpToI32(BinaryOperator &I) const;

  /// Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, and replacing \p I with a 32 bit 'icmp' operation.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(ICmpInst &I) const;

  /// Promotes uniform 'select' operation \p I to 32 bit 'select'
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, replacing \p I with a 32 bit 'select' operation, and
  /// truncating the result of the 32 bit 'select' operation back to \p I's
  /// original type.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(SelectInst &I) const;

  /// Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse'
  /// intrinsic.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by zero extending the operand to 32
  /// bits, replacing \p I with the 32 bit 'bitreverse' intrinsic, shifting the
  /// result of the 32 bit 'bitreverse' intrinsic to the right with zero fill
  /// (the shift amount is 32 minus \p I's base element bit width), and
  /// truncating the result of the shift operation back to \p I's original type.
  ///
  /// \returns True.
  bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;

  /// \returns The minimum number of bits needed to store the value of \p Op as
  /// an unsigned integer. Truncating to this size and then zero-extending to
  /// the original size will not change the value.
  unsigned numBitsUnsigned(Value *Op) const;

  /// \returns The minimum number of bits needed to store the value of \p Op as
  /// a signed integer. Truncating to this size and then sign-extending to
  /// the original size will not change the value.
  unsigned numBitsSigned(Value *Op) const;

  /// Replace mul instructions with llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.s24.
  /// SelectionDAG has an issue where an 'and' asserting the bits are known is
  /// not always exploited, so the replacement is done here on the IR.
  bool replaceMulWithMul24(BinaryOperator &I) const;

  /// Perform the same function as the equivalently named function in
  /// DAGCombiner. Since we expand some divisions here, we need to perform this
  /// before the expansion obscures the select.
  bool foldBinOpIntoSelect(BinaryOperator &I) const;

  bool divHasSpecialOptimization(BinaryOperator &I,
                                 Value *Num, Value *Den) const;
  int getDivNumBits(BinaryOperator &I,
                    Value *Num, Value *Den,
                    unsigned AtLeast, bool Signed) const;

  /// Expands 24 bit div or rem.
  Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I,
                        Value *Num, Value *Den,
                        bool IsDiv, bool IsSigned) const;

  Value *expandDivRem24Impl(IRBuilder<> &Builder, BinaryOperator &I,
                            Value *Num, Value *Den, unsigned NumBits,
                            bool IsDiv, bool IsSigned) const;

  /// Expands 32 bit div or rem.
  Value* expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I,
                        Value *Num, Value *Den) const;

  Value *shrinkDivRem64(IRBuilder<> &Builder, BinaryOperator &I,
                        Value *Num, Value *Den) const;
  void expandDivRem64(BinaryOperator &I) const;

  /// Widen a scalar load.
  ///
  /// \details Widen a uniform, sub-dword load from constant memory to a full
  /// 32 bits and then truncate the result, to allow a scalar load instead of a
  /// vector load.
  ///
  /// \returns True.
249 250 bool canWidenScalarExtLoad(LoadInst &I) const; 251 252 Value *matchFractPat(IntrinsicInst &I); 253 Value *applyFractPat(IRBuilder<> &Builder, Value *FractArg); 254 255 bool canOptimizeWithRsq(const FPMathOperator *SqrtOp, FastMathFlags DivFMF, 256 FastMathFlags SqrtFMF) const; 257 258 Value *optimizeWithRsq(IRBuilder<> &Builder, Value *Num, Value *Den, 259 FastMathFlags DivFMF, FastMathFlags SqrtFMF, 260 const Instruction *CtxI) const; 261 262 Value *optimizeWithRcp(IRBuilder<> &Builder, Value *Num, Value *Den, 263 FastMathFlags FMF, const Instruction *CtxI) const; 264 Value *optimizeWithFDivFast(IRBuilder<> &Builder, Value *Num, Value *Den, 265 float ReqdAccuracy) const; 266 267 Value *visitFDivElement(IRBuilder<> &Builder, Value *Num, Value *Den, 268 FastMathFlags DivFMF, FastMathFlags SqrtFMF, 269 Value *RsqOp, const Instruction *FDiv, 270 float ReqdAccuracy) const; 271 272 std::pair<Value *, Value *> getFrexpResults(IRBuilder<> &Builder, 273 Value *Src) const; 274 275 Value *emitRcpIEEE1ULP(IRBuilder<> &Builder, Value *Src, 276 bool IsNegative) const; 277 Value *emitFrexpDiv(IRBuilder<> &Builder, Value *LHS, Value *RHS, 278 FastMathFlags FMF) const; 279 280 public: 281 bool visitFDiv(BinaryOperator &I); 282 283 bool visitInstruction(Instruction &I) { return false; } 284 bool visitBinaryOperator(BinaryOperator &I); 285 bool visitLoadInst(LoadInst &I); 286 bool visitICmpInst(ICmpInst &I); 287 bool visitSelectInst(SelectInst &I); 288 bool visitPHINode(PHINode &I); 289 290 bool visitIntrinsicInst(IntrinsicInst &I); 291 bool visitBitreverseIntrinsicInst(IntrinsicInst &I); 292 bool visitMinNum(IntrinsicInst &I); 293 bool run(Function &F); 294 }; 295 296 class AMDGPUCodeGenPrepare : public FunctionPass { 297 private: 298 AMDGPUCodeGenPrepareImpl Impl; 299 300 public: 301 static char ID; 302 AMDGPUCodeGenPrepare() : FunctionPass(ID) { 303 initializeAMDGPUCodeGenPreparePass(*PassRegistry::getPassRegistry()); 304 } 305 void getAnalysisUsage(AnalysisUsage &AU) const override { 306 AU.addRequired<AssumptionCacheTracker>(); 307 AU.addRequired<UniformityInfoWrapperPass>(); 308 AU.addRequired<TargetLibraryInfoWrapperPass>(); 309 310 // FIXME: Division expansion needs to preserve the dominator tree. 
311 if (!ExpandDiv64InIR) 312 AU.setPreservesAll(); 313 } 314 bool runOnFunction(Function &F) override; 315 bool doInitialization(Module &M) override; 316 StringRef getPassName() const override { return "AMDGPU IR optimizations"; } 317 }; 318 319 } // end anonymous namespace 320 321 bool AMDGPUCodeGenPrepareImpl::run(Function &F) { 322 bool MadeChange = false; 323 324 Function::iterator NextBB; 325 for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; FI = NextBB) { 326 BasicBlock *BB = &*FI; 327 NextBB = std::next(FI); 328 329 BasicBlock::iterator Next; 330 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; 331 I = Next) { 332 Next = std::next(I); 333 334 MadeChange |= visit(*I); 335 336 if (Next != E) { // Control flow changed 337 BasicBlock *NextInstBB = Next->getParent(); 338 if (NextInstBB != BB) { 339 BB = NextInstBB; 340 E = BB->end(); 341 FE = F.end(); 342 } 343 } 344 } 345 } 346 return MadeChange; 347 } 348 349 unsigned AMDGPUCodeGenPrepareImpl::getBaseElementBitWidth(const Type *T) const { 350 assert(needsPromotionToI32(T) && "T does not need promotion to i32"); 351 352 if (T->isIntegerTy()) 353 return T->getIntegerBitWidth(); 354 return cast<VectorType>(T)->getElementType()->getIntegerBitWidth(); 355 } 356 357 Type *AMDGPUCodeGenPrepareImpl::getI32Ty(IRBuilder<> &B, const Type *T) const { 358 assert(needsPromotionToI32(T) && "T does not need promotion to i32"); 359 360 if (T->isIntegerTy()) 361 return B.getInt32Ty(); 362 return FixedVectorType::get(B.getInt32Ty(), cast<FixedVectorType>(T)); 363 } 364 365 bool AMDGPUCodeGenPrepareImpl::isSigned(const BinaryOperator &I) const { 366 return I.getOpcode() == Instruction::AShr || 367 I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem; 368 } 369 370 bool AMDGPUCodeGenPrepareImpl::isSigned(const SelectInst &I) const { 371 return isa<ICmpInst>(I.getOperand(0)) ? 372 cast<ICmpInst>(I.getOperand(0))->isSigned() : false; 373 } 374 375 bool AMDGPUCodeGenPrepareImpl::needsPromotionToI32(const Type *T) const { 376 if (!Widen16BitOps) 377 return false; 378 379 const IntegerType *IntTy = dyn_cast<IntegerType>(T); 380 if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16) 381 return true; 382 383 if (const VectorType *VT = dyn_cast<VectorType>(T)) { 384 // TODO: The set of packed operations is more limited, so may want to 385 // promote some anyway. 386 if (ST->hasVOP3PInsts()) 387 return false; 388 389 return needsPromotionToI32(VT->getElementType()); 390 } 391 392 return false; 393 } 394 395 bool AMDGPUCodeGenPrepareImpl::isLegalFloatingTy(const Type *Ty) const { 396 return Ty->isFloatTy() || Ty->isDoubleTy() || 397 (Ty->isHalfTy() && ST->has16BitInsts()); 398 } 399 400 // Return true if the op promoted to i32 should have nsw set. 401 static bool promotedOpIsNSW(const Instruction &I) { 402 switch (I.getOpcode()) { 403 case Instruction::Shl: 404 case Instruction::Add: 405 case Instruction::Sub: 406 return true; 407 case Instruction::Mul: 408 return I.hasNoUnsignedWrap(); 409 default: 410 return false; 411 } 412 } 413 414 // Return true if the op promoted to i32 should have nuw set. 
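// Illustrative note (not part of the pass logic): after an i16 add is widened
// by zero-extending both operands to i32, the 32-bit result cannot wrap, e.g.
// (2^16 - 1) + (2^16 - 1) < 2^32, so add/shl/mul can safely be marked nuw;
// sub can still wrap below zero, which is why promotedOpIsNUW below only
// forwards the original nuw flag for sub.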
415 static bool promotedOpIsNUW(const Instruction &I) { 416 switch (I.getOpcode()) { 417 case Instruction::Shl: 418 case Instruction::Add: 419 case Instruction::Mul: 420 return true; 421 case Instruction::Sub: 422 return I.hasNoUnsignedWrap(); 423 default: 424 return false; 425 } 426 } 427 428 bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &I) const { 429 Type *Ty = I.getType(); 430 const DataLayout &DL = Mod->getDataLayout(); 431 int TySize = DL.getTypeSizeInBits(Ty); 432 Align Alignment = DL.getValueOrABITypeAlignment(I.getAlign(), Ty); 433 434 return I.isSimple() && TySize < 32 && Alignment >= 4 && UA->isUniform(&I); 435 } 436 437 bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(BinaryOperator &I) const { 438 assert(needsPromotionToI32(I.getType()) && 439 "I does not need promotion to i32"); 440 441 if (I.getOpcode() == Instruction::SDiv || 442 I.getOpcode() == Instruction::UDiv || 443 I.getOpcode() == Instruction::SRem || 444 I.getOpcode() == Instruction::URem) 445 return false; 446 447 IRBuilder<> Builder(&I); 448 Builder.SetCurrentDebugLocation(I.getDebugLoc()); 449 450 Type *I32Ty = getI32Ty(Builder, I.getType()); 451 Value *ExtOp0 = nullptr; 452 Value *ExtOp1 = nullptr; 453 Value *ExtRes = nullptr; 454 Value *TruncRes = nullptr; 455 456 if (isSigned(I)) { 457 ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty); 458 ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty); 459 } else { 460 ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty); 461 ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty); 462 } 463 464 ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1); 465 if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) { 466 if (promotedOpIsNSW(cast<Instruction>(I))) 467 Inst->setHasNoSignedWrap(); 468 469 if (promotedOpIsNUW(cast<Instruction>(I))) 470 Inst->setHasNoUnsignedWrap(); 471 472 if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I)) 473 Inst->setIsExact(ExactOp->isExact()); 474 } 475 476 TruncRes = Builder.CreateTrunc(ExtRes, I.getType()); 477 478 I.replaceAllUsesWith(TruncRes); 479 I.eraseFromParent(); 480 481 return true; 482 } 483 484 bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(ICmpInst &I) const { 485 assert(needsPromotionToI32(I.getOperand(0)->getType()) && 486 "I does not need promotion to i32"); 487 488 IRBuilder<> Builder(&I); 489 Builder.SetCurrentDebugLocation(I.getDebugLoc()); 490 491 Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType()); 492 Value *ExtOp0 = nullptr; 493 Value *ExtOp1 = nullptr; 494 Value *NewICmp = nullptr; 495 496 if (I.isSigned()) { 497 ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty); 498 ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty); 499 } else { 500 ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty); 501 ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty); 502 } 503 NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1); 504 505 I.replaceAllUsesWith(NewICmp); 506 I.eraseFromParent(); 507 508 return true; 509 } 510 511 bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(SelectInst &I) const { 512 assert(needsPromotionToI32(I.getType()) && 513 "I does not need promotion to i32"); 514 515 IRBuilder<> Builder(&I); 516 Builder.SetCurrentDebugLocation(I.getDebugLoc()); 517 518 Type *I32Ty = getI32Ty(Builder, I.getType()); 519 Value *ExtOp1 = nullptr; 520 Value *ExtOp2 = nullptr; 521 Value *ExtRes = nullptr; 522 Value *TruncRes = nullptr; 523 524 if (isSigned(I)) { 525 ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty); 526 ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty); 527 
} else { 528 ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty); 529 ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty); 530 } 531 ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2); 532 TruncRes = Builder.CreateTrunc(ExtRes, I.getType()); 533 534 I.replaceAllUsesWith(TruncRes); 535 I.eraseFromParent(); 536 537 return true; 538 } 539 540 bool AMDGPUCodeGenPrepareImpl::promoteUniformBitreverseToI32( 541 IntrinsicInst &I) const { 542 assert(I.getIntrinsicID() == Intrinsic::bitreverse && 543 "I must be bitreverse intrinsic"); 544 assert(needsPromotionToI32(I.getType()) && 545 "I does not need promotion to i32"); 546 547 IRBuilder<> Builder(&I); 548 Builder.SetCurrentDebugLocation(I.getDebugLoc()); 549 550 Type *I32Ty = getI32Ty(Builder, I.getType()); 551 Function *I32 = 552 Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty }); 553 Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty); 554 Value *ExtRes = Builder.CreateCall(I32, { ExtOp }); 555 Value *LShrOp = 556 Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType())); 557 Value *TruncRes = 558 Builder.CreateTrunc(LShrOp, I.getType()); 559 560 I.replaceAllUsesWith(TruncRes); 561 I.eraseFromParent(); 562 563 return true; 564 } 565 566 unsigned AMDGPUCodeGenPrepareImpl::numBitsUnsigned(Value *Op) const { 567 return computeKnownBits(Op, *DL, 0, AC).countMaxActiveBits(); 568 } 569 570 unsigned AMDGPUCodeGenPrepareImpl::numBitsSigned(Value *Op) const { 571 return ComputeMaxSignificantBits(Op, *DL, 0, AC); 572 } 573 574 static void extractValues(IRBuilder<> &Builder, 575 SmallVectorImpl<Value *> &Values, Value *V) { 576 auto *VT = dyn_cast<FixedVectorType>(V->getType()); 577 if (!VT) { 578 Values.push_back(V); 579 return; 580 } 581 582 for (int I = 0, E = VT->getNumElements(); I != E; ++I) 583 Values.push_back(Builder.CreateExtractElement(V, I)); 584 } 585 586 static Value *insertValues(IRBuilder<> &Builder, 587 Type *Ty, 588 SmallVectorImpl<Value *> &Values) { 589 if (!Ty->isVectorTy()) { 590 assert(Values.size() == 1); 591 return Values[0]; 592 } 593 594 Value *NewVal = PoisonValue::get(Ty); 595 for (int I = 0, E = Values.size(); I != E; ++I) 596 NewVal = Builder.CreateInsertElement(NewVal, Values[I], I); 597 598 return NewVal; 599 } 600 601 // Returns 24-bit or 48-bit (as per `NumBits` and `Size`) mul of `LHS` and 602 // `RHS`. `NumBits` is the number of KnownBits of the result and `Size` is the 603 // width of the original destination. 604 static Value *getMul24(IRBuilder<> &Builder, Value *LHS, Value *RHS, 605 unsigned Size, unsigned NumBits, bool IsSigned) { 606 if (Size <= 32 || NumBits <= 32) { 607 Intrinsic::ID ID = 608 IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24; 609 return Builder.CreateIntrinsic(ID, {}, {LHS, RHS}); 610 } 611 612 assert(NumBits <= 48); 613 614 Intrinsic::ID LoID = 615 IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24; 616 Intrinsic::ID HiID = 617 IsSigned ? 
Intrinsic::amdgcn_mulhi_i24 : Intrinsic::amdgcn_mulhi_u24; 618 619 Value *Lo = Builder.CreateIntrinsic(LoID, {}, {LHS, RHS}); 620 Value *Hi = Builder.CreateIntrinsic(HiID, {}, {LHS, RHS}); 621 622 IntegerType *I64Ty = Builder.getInt64Ty(); 623 Lo = Builder.CreateZExtOrTrunc(Lo, I64Ty); 624 Hi = Builder.CreateZExtOrTrunc(Hi, I64Ty); 625 626 return Builder.CreateOr(Lo, Builder.CreateShl(Hi, 32)); 627 } 628 629 bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const { 630 if (I.getOpcode() != Instruction::Mul) 631 return false; 632 633 Type *Ty = I.getType(); 634 unsigned Size = Ty->getScalarSizeInBits(); 635 if (Size <= 16 && ST->has16BitInsts()) 636 return false; 637 638 // Prefer scalar if this could be s_mul_i32 639 if (UA->isUniform(&I)) 640 return false; 641 642 Value *LHS = I.getOperand(0); 643 Value *RHS = I.getOperand(1); 644 IRBuilder<> Builder(&I); 645 Builder.SetCurrentDebugLocation(I.getDebugLoc()); 646 647 unsigned LHSBits = 0, RHSBits = 0; 648 bool IsSigned = false; 649 650 if (ST->hasMulU24() && (LHSBits = numBitsUnsigned(LHS)) <= 24 && 651 (RHSBits = numBitsUnsigned(RHS)) <= 24) { 652 IsSigned = false; 653 654 } else if (ST->hasMulI24() && (LHSBits = numBitsSigned(LHS)) <= 24 && 655 (RHSBits = numBitsSigned(RHS)) <= 24) { 656 IsSigned = true; 657 658 } else 659 return false; 660 661 SmallVector<Value *, 4> LHSVals; 662 SmallVector<Value *, 4> RHSVals; 663 SmallVector<Value *, 4> ResultVals; 664 extractValues(Builder, LHSVals, LHS); 665 extractValues(Builder, RHSVals, RHS); 666 667 IntegerType *I32Ty = Builder.getInt32Ty(); 668 for (int I = 0, E = LHSVals.size(); I != E; ++I) { 669 Value *LHS, *RHS; 670 if (IsSigned) { 671 LHS = Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty); 672 RHS = Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty); 673 } else { 674 LHS = Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty); 675 RHS = Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty); 676 } 677 678 Value *Result = 679 getMul24(Builder, LHS, RHS, Size, LHSBits + RHSBits, IsSigned); 680 681 if (IsSigned) { 682 ResultVals.push_back( 683 Builder.CreateSExtOrTrunc(Result, LHSVals[I]->getType())); 684 } else { 685 ResultVals.push_back( 686 Builder.CreateZExtOrTrunc(Result, LHSVals[I]->getType())); 687 } 688 } 689 690 Value *NewVal = insertValues(Builder, Ty, ResultVals); 691 NewVal->takeName(&I); 692 I.replaceAllUsesWith(NewVal); 693 I.eraseFromParent(); 694 695 return true; 696 } 697 698 // Find a select instruction, which may have been casted. This is mostly to deal 699 // with cases where i16 selects were promoted here to i32. 700 static SelectInst *findSelectThroughCast(Value *V, CastInst *&Cast) { 701 Cast = nullptr; 702 if (SelectInst *Sel = dyn_cast<SelectInst>(V)) 703 return Sel; 704 705 if ((Cast = dyn_cast<CastInst>(V))) { 706 if (SelectInst *Sel = dyn_cast<SelectInst>(Cast->getOperand(0))) 707 return Sel; 708 } 709 710 return nullptr; 711 } 712 713 bool AMDGPUCodeGenPrepareImpl::foldBinOpIntoSelect(BinaryOperator &BO) const { 714 // Don't do this unless the old select is going away. We want to eliminate the 715 // binary operator, not replace a binop with a select. 716 int SelOpNo = 0; 717 718 CastInst *CastOp; 719 720 // TODO: Should probably try to handle some cases with multiple 721 // users. Duplicating the select may be profitable for division. 
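  // Illustrative example of the fold performed below (values hypothetical):
  //   %sel = select i1 %c, i32 16, i32 8
  //   %r   = udiv i32 1024, %sel
  // becomes
  //   %r   = select i1 %c, i32 64, i32 128
  // so the division disappears once both arms fold with the constant operand.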
722 SelectInst *Sel = findSelectThroughCast(BO.getOperand(0), CastOp); 723 if (!Sel || !Sel->hasOneUse()) { 724 SelOpNo = 1; 725 Sel = findSelectThroughCast(BO.getOperand(1), CastOp); 726 } 727 728 if (!Sel || !Sel->hasOneUse()) 729 return false; 730 731 Constant *CT = dyn_cast<Constant>(Sel->getTrueValue()); 732 Constant *CF = dyn_cast<Constant>(Sel->getFalseValue()); 733 Constant *CBO = dyn_cast<Constant>(BO.getOperand(SelOpNo ^ 1)); 734 if (!CBO || !CT || !CF) 735 return false; 736 737 if (CastOp) { 738 if (!CastOp->hasOneUse()) 739 return false; 740 CT = ConstantFoldCastOperand(CastOp->getOpcode(), CT, BO.getType(), *DL); 741 CF = ConstantFoldCastOperand(CastOp->getOpcode(), CF, BO.getType(), *DL); 742 } 743 744 // TODO: Handle special 0/-1 cases DAG combine does, although we only really 745 // need to handle divisions here. 746 Constant *FoldedT = SelOpNo ? 747 ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CT, *DL) : 748 ConstantFoldBinaryOpOperands(BO.getOpcode(), CT, CBO, *DL); 749 if (!FoldedT || isa<ConstantExpr>(FoldedT)) 750 return false; 751 752 Constant *FoldedF = SelOpNo ? 753 ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CF, *DL) : 754 ConstantFoldBinaryOpOperands(BO.getOpcode(), CF, CBO, *DL); 755 if (!FoldedF || isa<ConstantExpr>(FoldedF)) 756 return false; 757 758 IRBuilder<> Builder(&BO); 759 Builder.SetCurrentDebugLocation(BO.getDebugLoc()); 760 if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(&BO)) 761 Builder.setFastMathFlags(FPOp->getFastMathFlags()); 762 763 Value *NewSelect = Builder.CreateSelect(Sel->getCondition(), 764 FoldedT, FoldedF); 765 NewSelect->takeName(&BO); 766 BO.replaceAllUsesWith(NewSelect); 767 BO.eraseFromParent(); 768 if (CastOp) 769 CastOp->eraseFromParent(); 770 Sel->eraseFromParent(); 771 return true; 772 } 773 774 std::pair<Value *, Value *> 775 AMDGPUCodeGenPrepareImpl::getFrexpResults(IRBuilder<> &Builder, 776 Value *Src) const { 777 Type *Ty = Src->getType(); 778 Value *Frexp = Builder.CreateIntrinsic(Intrinsic::frexp, 779 {Ty, Builder.getInt32Ty()}, Src); 780 Value *FrexpMant = Builder.CreateExtractValue(Frexp, {0}); 781 782 // Bypass the bug workaround for the exponent result since it doesn't matter. 783 // TODO: Does the bug workaround even really need to consider the exponent 784 // result? It's unspecified by the spec. 785 786 Value *FrexpExp = 787 ST->hasFractBug() 788 ? Builder.CreateIntrinsic(Intrinsic::amdgcn_frexp_exp, 789 {Builder.getInt32Ty(), Ty}, Src) 790 : Builder.CreateExtractValue(Frexp, {1}); 791 return {FrexpMant, FrexpExp}; 792 } 793 794 /// Emit an expansion of 1.0 / Src good for 1ulp that supports denormals. 795 Value *AMDGPUCodeGenPrepareImpl::emitRcpIEEE1ULP(IRBuilder<> &Builder, 796 Value *Src, 797 bool IsNegative) const { 798 // Same as for 1.0, but expand the sign out of the constant. 799 // -1.0 / x -> rcp (fneg x) 800 if (IsNegative) 801 Src = Builder.CreateFNeg(Src); 802 803 // The rcp instruction doesn't support denormals, so scale the input 804 // out of the denormal range and convert at the end. 805 // 806 // Expand as 2^-n * (1.0 / (x * 2^n)) 807 808 // TODO: Skip scaling if input is known never denormal and the input 809 // range won't underflow to denormal. The hard part is knowing the 810 // result. We need a range check, the result could be denormal for 811 // 0x1p+126 < den <= 0x1p+127. 
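  // Sketch of the sequence emitted below (frexp decomposes x as mant * 2^exp
  // with mant in [0.5, 1.0)):
  //   {mant, exp} = frexp(x)
  //   result      = ldexp(rcp(mant), -exp)   // 1/x == (1/mant) * 2^-exp
  // Keeping the rcp operand in [0.5, 1.0) avoids feeding it a denormal.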
812 813 Type *Ty = Src->getType(); 814 815 auto [FrexpMant, FrexpExp] = getFrexpResults(Builder, Src); 816 Value *ScaleFactor = Builder.CreateNeg(FrexpExp); 817 Value *Rcp = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, FrexpMant); 818 return Builder.CreateIntrinsic(Intrinsic::ldexp, {Ty, Builder.getInt32Ty()}, 819 {Rcp, ScaleFactor}); 820 } 821 822 /// Emit a 2ulp expansion for fdiv by using frexp for input scaling. 823 Value *AMDGPUCodeGenPrepareImpl::emitFrexpDiv(IRBuilder<> &Builder, Value *LHS, 824 Value *RHS, 825 FastMathFlags FMF) const { 826 // If we have have to work around the fract/frexp bug, we're worse off than 827 // using the fdiv.fast expansion. The full safe expansion is faster if we have 828 // fast FMA. 829 if (HasFP32DenormalFlush && ST->hasFractBug() && !ST->hasFastFMAF32() && 830 (!FMF.noNaNs() || !FMF.noInfs())) 831 return nullptr; 832 833 // We're scaling the LHS to avoid a denormal input, and scale the denominator 834 // to avoid large values underflowing the result. 835 Type *Ty = LHS->getType(); 836 837 auto [FrexpMantRHS, FrexpExpRHS] = getFrexpResults(Builder, RHS); 838 839 Value *Rcp = 840 Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, FrexpMantRHS); 841 842 auto [FrexpMantLHS, FrexpExpLHS] = getFrexpResults(Builder, LHS); 843 Value *Mul = Builder.CreateFMul(FrexpMantLHS, Rcp); 844 845 // We multiplied by 2^N/2^M, so we need to multiply by 2^(N-M) to scale the 846 // result. 847 Value *ExpDiff = Builder.CreateSub(FrexpExpLHS, FrexpExpRHS); 848 return Builder.CreateIntrinsic(Intrinsic::ldexp, {Ty, Builder.getInt32Ty()}, 849 {Mul, ExpDiff}); 850 } 851 852 /// Emit an expansion of 1.0 / sqrt(Src) good for 1ulp that supports denormals. 853 static Value *emitRsqIEEE1ULP(IRBuilder<> &Builder, Value *Src, 854 bool IsNegative) { 855 // bool need_scale = x < 0x1p-126f; 856 // float input_scale = need_scale ? 0x1.0p+24f : 1.0f; 857 // float output_scale = need_scale ? 0x1.0p+12f : 1.0f; 858 // rsq(x * input_scale) * output_scale; 859 860 Type *Ty = Src->getType(); 861 APFloat SmallestNormal = 862 APFloat::getSmallestNormalized(Ty->getFltSemantics()); 863 Value *NeedScale = 864 Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal)); 865 Constant *One = ConstantFP::get(Ty, 1.0); 866 Constant *InputScale = ConstantFP::get(Ty, 0x1.0p+24); 867 Constant *OutputScale = 868 ConstantFP::get(Ty, IsNegative ? -0x1.0p+12 : 0x1.0p+12); 869 870 Value *InputScaleFactor = Builder.CreateSelect(NeedScale, InputScale, One); 871 872 Value *ScaledInput = Builder.CreateFMul(Src, InputScaleFactor); 873 Value *Rsq = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, ScaledInput); 874 Value *OutputScaleFactor = Builder.CreateSelect( 875 NeedScale, OutputScale, IsNegative ? ConstantFP::get(Ty, -1.0) : One); 876 877 return Builder.CreateFMul(Rsq, OutputScaleFactor); 878 } 879 880 bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(const FPMathOperator *SqrtOp, 881 FastMathFlags DivFMF, 882 FastMathFlags SqrtFMF) const { 883 // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp. 884 if (!DivFMF.allowContract() || !SqrtFMF.allowContract()) 885 return false; 886 887 // v_rsq_f32 gives 1ulp 888 return SqrtFMF.approxFunc() || HasUnsafeFPMath || 889 SqrtOp->getFPAccuracy() >= 1.0f; 890 } 891 892 Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq( 893 IRBuilder<> &Builder, Value *Num, Value *Den, FastMathFlags DivFMF, 894 FastMathFlags SqrtFMF, const Instruction *CtxI) const { 895 // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp. 
896 assert(DivFMF.allowContract() && SqrtFMF.allowContract()); 897 898 // rsq_f16 is accurate to 0.51 ulp. 899 // rsq_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed. 900 // rsq_f64 is never accurate. 901 const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num); 902 if (!CLHS) 903 return nullptr; 904 905 assert(Den->getType()->isFloatTy()); 906 907 bool IsNegative = false; 908 909 // TODO: Handle other numerator values with arcp. 910 if (CLHS->isExactlyValue(1.0) || (IsNegative = CLHS->isExactlyValue(-1.0))) { 911 // Add in the sqrt flags. 912 IRBuilder<>::FastMathFlagGuard Guard(Builder); 913 DivFMF |= SqrtFMF; 914 Builder.setFastMathFlags(DivFMF); 915 916 if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) || 917 canIgnoreDenormalInput(Den, CtxI)) { 918 Value *Result = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, Den); 919 // -1.0 / sqrt(x) -> fneg(rsq(x)) 920 return IsNegative ? Builder.CreateFNeg(Result) : Result; 921 } 922 923 return emitRsqIEEE1ULP(Builder, Den, IsNegative); 924 } 925 926 return nullptr; 927 } 928 929 // Optimize fdiv with rcp: 930 // 931 // 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is 932 // allowed with unsafe-fp-math or afn. 933 // 934 // a/b -> a*rcp(b) when arcp is allowed, and we only need provide ULP 1.0 935 Value * 936 AMDGPUCodeGenPrepareImpl::optimizeWithRcp(IRBuilder<> &Builder, Value *Num, 937 Value *Den, FastMathFlags FMF, 938 const Instruction *CtxI) const { 939 // rcp_f16 is accurate to 0.51 ulp. 940 // rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed. 941 // rcp_f64 is never accurate. 942 assert(Den->getType()->isFloatTy()); 943 944 if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num)) { 945 bool IsNegative = false; 946 if (CLHS->isExactlyValue(1.0) || 947 (IsNegative = CLHS->isExactlyValue(-1.0))) { 948 Value *Src = Den; 949 950 if (HasFP32DenormalFlush || FMF.approxFunc()) { 951 // -1.0 / x -> 1.0 / fneg(x) 952 if (IsNegative) 953 Src = Builder.CreateFNeg(Src); 954 955 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to 956 // the CI documentation has a worst case error of 1 ulp. 957 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK 958 // to use it as long as we aren't trying to use denormals. 959 // 960 // v_rcp_f16 and v_rsq_f16 DO support denormals. 961 962 // NOTE: v_sqrt and v_rcp will be combined to v_rsq later. So we don't 963 // insert rsq intrinsic here. 964 965 // 1.0 / x -> rcp(x) 966 return Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, Src); 967 } 968 969 // TODO: If the input isn't denormal, and we know the input exponent isn't 970 // big enough to introduce a denormal we can avoid the scaling. 971 return emitRcpIEEE1ULP(Builder, Src, IsNegative); 972 } 973 } 974 975 if (FMF.allowReciprocal()) { 976 // x / y -> x * (1.0 / y) 977 978 // TODO: Could avoid denormal scaling and use raw rcp if we knew the output 979 // will never underflow. 980 if (HasFP32DenormalFlush || FMF.approxFunc()) { 981 Value *Recip = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, Den); 982 return Builder.CreateFMul(Num, Recip); 983 } 984 985 Value *Recip = emitRcpIEEE1ULP(Builder, Den, false); 986 return Builder.CreateFMul(Num, Recip); 987 } 988 989 return nullptr; 990 } 991 992 // optimize with fdiv.fast: 993 // 994 // a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed. 995 // 996 // 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp. 997 // 998 // NOTE: optimizeWithRcp should be tried first because rcp is the preference. 
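//
// For example (illustrative IR, names hypothetical), with f32 denormals
// flushed and !fpmath allowing 2.5 ulp:
//   %q = fdiv float %a, %b, !fpmath !0     ; !0 = !{float 2.500000e+00}
// is rewritten by the function below into
//   %q = call float @llvm.amdgcn.fdiv.fast(float %a, float %b)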
999 Value *AMDGPUCodeGenPrepareImpl::optimizeWithFDivFast( 1000 IRBuilder<> &Builder, Value *Num, Value *Den, float ReqdAccuracy) const { 1001 // fdiv.fast can achieve 2.5 ULP accuracy. 1002 if (ReqdAccuracy < 2.5f) 1003 return nullptr; 1004 1005 // Only have fdiv.fast for f32. 1006 assert(Den->getType()->isFloatTy()); 1007 1008 bool NumIsOne = false; 1009 if (const ConstantFP *CNum = dyn_cast<ConstantFP>(Num)) { 1010 if (CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0)) 1011 NumIsOne = true; 1012 } 1013 1014 // fdiv does not support denormals. But 1.0/x is always fine to use it. 1015 // 1016 // TODO: This works for any value with a specific known exponent range, don't 1017 // just limit to constant 1. 1018 if (!HasFP32DenormalFlush && !NumIsOne) 1019 return nullptr; 1020 1021 return Builder.CreateIntrinsic(Intrinsic::amdgcn_fdiv_fast, {}, {Num, Den}); 1022 } 1023 1024 Value *AMDGPUCodeGenPrepareImpl::visitFDivElement( 1025 IRBuilder<> &Builder, Value *Num, Value *Den, FastMathFlags DivFMF, 1026 FastMathFlags SqrtFMF, Value *RsqOp, const Instruction *FDivInst, 1027 float ReqdDivAccuracy) const { 1028 if (RsqOp) { 1029 Value *Rsq = 1030 optimizeWithRsq(Builder, Num, RsqOp, DivFMF, SqrtFMF, FDivInst); 1031 if (Rsq) 1032 return Rsq; 1033 } 1034 1035 Value *Rcp = optimizeWithRcp(Builder, Num, Den, DivFMF, FDivInst); 1036 if (Rcp) 1037 return Rcp; 1038 1039 // In the basic case fdiv_fast has the same instruction count as the frexp div 1040 // expansion. Slightly prefer fdiv_fast since it ends in an fmul that can 1041 // potentially be fused into a user. Also, materialization of the constants 1042 // can be reused for multiple instances. 1043 Value *FDivFast = optimizeWithFDivFast(Builder, Num, Den, ReqdDivAccuracy); 1044 if (FDivFast) 1045 return FDivFast; 1046 1047 return emitFrexpDiv(Builder, Num, Den, DivFMF); 1048 } 1049 1050 // Optimizations is performed based on fpmath, fast math flags as well as 1051 // denormals to optimize fdiv with either rcp or fdiv.fast. 1052 // 1053 // With rcp: 1054 // 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is 1055 // allowed with unsafe-fp-math or afn. 1056 // 1057 // a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn. 1058 // 1059 // With fdiv.fast: 1060 // a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed. 1061 // 1062 // 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp. 1063 // 1064 // NOTE: rcp is the preference in cases that both are legal. 1065 bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) { 1066 if (DisableFDivExpand) 1067 return false; 1068 1069 Type *Ty = FDiv.getType()->getScalarType(); 1070 if (!Ty->isFloatTy()) 1071 return false; 1072 1073 // The f64 rcp/rsq approximations are pretty inaccurate. We can do an 1074 // expansion around them in codegen. f16 is good enough to always use. 1075 1076 const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv); 1077 const FastMathFlags DivFMF = FPOp->getFastMathFlags(); 1078 const float ReqdAccuracy = FPOp->getFPAccuracy(); 1079 1080 // Inaccurate rcp is allowed with unsafe-fp-math or afn. 1081 // 1082 // Defer to codegen to handle this. 1083 // 1084 // TODO: Decide on an interpretation for interactions between afn + arcp + 1085 // !fpmath, and make it consistent between here and codegen. For now, defer 1086 // expansion of afn to codegen. The current interpretation is so aggressive we 1087 // don't need any pre-consideration here when we have better information. 
A 1088 // more conservative interpretation could use handling here. 1089 const bool AllowInaccurateRcp = HasUnsafeFPMath || DivFMF.approxFunc(); 1090 if (AllowInaccurateRcp) 1091 return false; 1092 1093 // Defer the correct implementations to codegen. 1094 if (ReqdAccuracy < 1.0f) 1095 return false; 1096 1097 FastMathFlags SqrtFMF; 1098 1099 Value *Num = FDiv.getOperand(0); 1100 Value *Den = FDiv.getOperand(1); 1101 1102 Value *RsqOp = nullptr; 1103 auto *DenII = dyn_cast<IntrinsicInst>(Den); 1104 if (DenII && DenII->getIntrinsicID() == Intrinsic::sqrt && 1105 DenII->hasOneUse()) { 1106 const auto *SqrtOp = cast<FPMathOperator>(DenII); 1107 SqrtFMF = SqrtOp->getFastMathFlags(); 1108 if (canOptimizeWithRsq(SqrtOp, DivFMF, SqrtFMF)) 1109 RsqOp = SqrtOp->getOperand(0); 1110 } 1111 1112 IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator())); 1113 Builder.setFastMathFlags(DivFMF); 1114 Builder.SetCurrentDebugLocation(FDiv.getDebugLoc()); 1115 1116 SmallVector<Value *, 4> NumVals; 1117 SmallVector<Value *, 4> DenVals; 1118 SmallVector<Value *, 4> RsqDenVals; 1119 extractValues(Builder, NumVals, Num); 1120 extractValues(Builder, DenVals, Den); 1121 1122 if (RsqOp) 1123 extractValues(Builder, RsqDenVals, RsqOp); 1124 1125 SmallVector<Value *, 4> ResultVals(NumVals.size()); 1126 for (int I = 0, E = NumVals.size(); I != E; ++I) { 1127 Value *NumElt = NumVals[I]; 1128 Value *DenElt = DenVals[I]; 1129 Value *RsqDenElt = RsqOp ? RsqDenVals[I] : nullptr; 1130 1131 Value *NewElt = 1132 visitFDivElement(Builder, NumElt, DenElt, DivFMF, SqrtFMF, RsqDenElt, 1133 cast<Instruction>(FPOp), ReqdAccuracy); 1134 if (!NewElt) { 1135 // Keep the original, but scalarized. 1136 1137 // This has the unfortunate side effect of sometimes scalarizing when 1138 // we're not going to do anything. 1139 NewElt = Builder.CreateFDiv(NumElt, DenElt); 1140 if (auto *NewEltInst = dyn_cast<Instruction>(NewElt)) 1141 NewEltInst->copyMetadata(FDiv); 1142 } 1143 1144 ResultVals[I] = NewElt; 1145 } 1146 1147 Value *NewVal = insertValues(Builder, FDiv.getType(), ResultVals); 1148 1149 if (NewVal) { 1150 FDiv.replaceAllUsesWith(NewVal); 1151 NewVal->takeName(&FDiv); 1152 RecursivelyDeleteTriviallyDeadInstructions(&FDiv, TLInfo); 1153 } 1154 1155 return true; 1156 } 1157 1158 static bool hasUnsafeFPMath(const Function &F) { 1159 Attribute Attr = F.getFnAttribute("unsafe-fp-math"); 1160 return Attr.getValueAsBool(); 1161 } 1162 1163 static std::pair<Value*, Value*> getMul64(IRBuilder<> &Builder, 1164 Value *LHS, Value *RHS) { 1165 Type *I32Ty = Builder.getInt32Ty(); 1166 Type *I64Ty = Builder.getInt64Ty(); 1167 1168 Value *LHS_EXT64 = Builder.CreateZExt(LHS, I64Ty); 1169 Value *RHS_EXT64 = Builder.CreateZExt(RHS, I64Ty); 1170 Value *MUL64 = Builder.CreateMul(LHS_EXT64, RHS_EXT64); 1171 Value *Lo = Builder.CreateTrunc(MUL64, I32Ty); 1172 Value *Hi = Builder.CreateLShr(MUL64, Builder.getInt64(32)); 1173 Hi = Builder.CreateTrunc(Hi, I32Ty); 1174 return std::pair(Lo, Hi); 1175 } 1176 1177 static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) { 1178 return getMul64(Builder, LHS, RHS).second; 1179 } 1180 1181 /// Figure out how many bits are really needed for this division. \p AtLeast is 1182 /// an optimization hint to bypass the second ComputeNumSignBits call if we the 1183 /// first one is insufficient. Returns -1 on failure. 
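// Worked example (illustrative): for a 64-bit sdiv whose operands each have at
// least 41 known sign bits, SignBits = 41 and DivBits = 64 - 41 + 1 = 24, so
// the divide fits the 24-bit expansion; fewer than \p AtLeast sign bits on
// either operand returns -1.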
1184 int AMDGPUCodeGenPrepareImpl::getDivNumBits(BinaryOperator &I, Value *Num, 1185 Value *Den, unsigned AtLeast, 1186 bool IsSigned) const { 1187 const DataLayout &DL = Mod->getDataLayout(); 1188 unsigned LHSSignBits = ComputeNumSignBits(Num, DL, 0, AC, &I); 1189 if (LHSSignBits < AtLeast) 1190 return -1; 1191 1192 unsigned RHSSignBits = ComputeNumSignBits(Den, DL, 0, AC, &I); 1193 if (RHSSignBits < AtLeast) 1194 return -1; 1195 1196 unsigned SignBits = std::min(LHSSignBits, RHSSignBits); 1197 unsigned DivBits = Num->getType()->getScalarSizeInBits() - SignBits; 1198 if (IsSigned) 1199 ++DivBits; 1200 return DivBits; 1201 } 1202 1203 // The fractional part of a float is enough to accurately represent up to 1204 // a 24-bit signed integer. 1205 Value *AMDGPUCodeGenPrepareImpl::expandDivRem24(IRBuilder<> &Builder, 1206 BinaryOperator &I, Value *Num, 1207 Value *Den, bool IsDiv, 1208 bool IsSigned) const { 1209 int DivBits = getDivNumBits(I, Num, Den, 9, IsSigned); 1210 if (DivBits == -1) 1211 return nullptr; 1212 return expandDivRem24Impl(Builder, I, Num, Den, DivBits, IsDiv, IsSigned); 1213 } 1214 1215 Value *AMDGPUCodeGenPrepareImpl::expandDivRem24Impl( 1216 IRBuilder<> &Builder, BinaryOperator &I, Value *Num, Value *Den, 1217 unsigned DivBits, bool IsDiv, bool IsSigned) const { 1218 Type *I32Ty = Builder.getInt32Ty(); 1219 Num = Builder.CreateTrunc(Num, I32Ty); 1220 Den = Builder.CreateTrunc(Den, I32Ty); 1221 1222 Type *F32Ty = Builder.getFloatTy(); 1223 ConstantInt *One = Builder.getInt32(1); 1224 Value *JQ = One; 1225 1226 if (IsSigned) { 1227 // char|short jq = ia ^ ib; 1228 JQ = Builder.CreateXor(Num, Den); 1229 1230 // jq = jq >> (bitsize - 2) 1231 JQ = Builder.CreateAShr(JQ, Builder.getInt32(30)); 1232 1233 // jq = jq | 0x1 1234 JQ = Builder.CreateOr(JQ, One); 1235 } 1236 1237 // int ia = (int)LHS; 1238 Value *IA = Num; 1239 1240 // int ib, (int)RHS; 1241 Value *IB = Den; 1242 1243 // float fa = (float)ia; 1244 Value *FA = IsSigned ? Builder.CreateSIToFP(IA, F32Ty) 1245 : Builder.CreateUIToFP(IA, F32Ty); 1246 1247 // float fb = (float)ib; 1248 Value *FB = IsSigned ? Builder.CreateSIToFP(IB,F32Ty) 1249 : Builder.CreateUIToFP(IB,F32Ty); 1250 1251 Function *RcpDecl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp, 1252 Builder.getFloatTy()); 1253 Value *RCP = Builder.CreateCall(RcpDecl, { FB }); 1254 Value *FQM = Builder.CreateFMul(FA, RCP); 1255 1256 // fq = trunc(fqm); 1257 CallInst *FQ = Builder.CreateUnaryIntrinsic(Intrinsic::trunc, FQM); 1258 FQ->copyFastMathFlags(Builder.getFastMathFlags()); 1259 1260 // float fqneg = -fq; 1261 Value *FQNeg = Builder.CreateFNeg(FQ); 1262 1263 // float fr = mad(fqneg, fb, fa); 1264 auto FMAD = !ST->hasMadMacF32Insts() 1265 ? Intrinsic::fma 1266 : (Intrinsic::ID)Intrinsic::amdgcn_fmad_ftz; 1267 Value *FR = Builder.CreateIntrinsic(FMAD, 1268 {FQNeg->getType()}, {FQNeg, FB, FA}, FQ); 1269 1270 // int iq = (int)fq; 1271 Value *IQ = IsSigned ? Builder.CreateFPToSI(FQ, I32Ty) 1272 : Builder.CreateFPToUI(FQ, I32Ty); 1273 1274 // fr = fabs(fr); 1275 FR = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FR, FQ); 1276 1277 // fb = fabs(fb); 1278 FB = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FB, FQ); 1279 1280 // int cv = fr >= fb; 1281 Value *CV = Builder.CreateFCmpOGE(FR, FB); 1282 1283 // jq = (cv ? 
jq : 0); 1284 JQ = Builder.CreateSelect(CV, JQ, Builder.getInt32(0)); 1285 1286 // dst = iq + jq; 1287 Value *Div = Builder.CreateAdd(IQ, JQ); 1288 1289 Value *Res = Div; 1290 if (!IsDiv) { 1291 // Rem needs compensation, it's easier to recompute it 1292 Value *Rem = Builder.CreateMul(Div, Den); 1293 Res = Builder.CreateSub(Num, Rem); 1294 } 1295 1296 if (DivBits != 0 && DivBits < 32) { 1297 // Extend in register from the number of bits this divide really is. 1298 if (IsSigned) { 1299 int InRegBits = 32 - DivBits; 1300 1301 Res = Builder.CreateShl(Res, InRegBits); 1302 Res = Builder.CreateAShr(Res, InRegBits); 1303 } else { 1304 ConstantInt *TruncMask 1305 = Builder.getInt32((UINT64_C(1) << DivBits) - 1); 1306 Res = Builder.CreateAnd(Res, TruncMask); 1307 } 1308 } 1309 1310 return Res; 1311 } 1312 1313 // Try to recognize special cases the DAG will emit special, better expansions 1314 // than the general expansion we do here. 1315 1316 // TODO: It would be better to just directly handle those optimizations here. 1317 bool AMDGPUCodeGenPrepareImpl::divHasSpecialOptimization(BinaryOperator &I, 1318 Value *Num, 1319 Value *Den) const { 1320 if (Constant *C = dyn_cast<Constant>(Den)) { 1321 // Arbitrary constants get a better expansion as long as a wider mulhi is 1322 // legal. 1323 if (C->getType()->getScalarSizeInBits() <= 32) 1324 return true; 1325 1326 // TODO: Sdiv check for not exact for some reason. 1327 1328 // If there's no wider mulhi, there's only a better expansion for powers of 1329 // two. 1330 // TODO: Should really know for each vector element. 1331 if (isKnownToBeAPowerOfTwo(C, *DL, true, 0, AC, &I, DT)) 1332 return true; 1333 1334 return false; 1335 } 1336 1337 if (BinaryOperator *BinOpDen = dyn_cast<BinaryOperator>(Den)) { 1338 // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2 1339 if (BinOpDen->getOpcode() == Instruction::Shl && 1340 isa<Constant>(BinOpDen->getOperand(0)) && 1341 isKnownToBeAPowerOfTwo(BinOpDen->getOperand(0), *DL, true, 1342 0, AC, &I, DT)) { 1343 return true; 1344 } 1345 } 1346 1347 return false; 1348 } 1349 1350 static Value *getSign32(Value *V, IRBuilder<> &Builder, const DataLayout *DL) { 1351 // Check whether the sign can be determined statically. 1352 KnownBits Known = computeKnownBits(V, *DL); 1353 if (Known.isNegative()) 1354 return Constant::getAllOnesValue(V->getType()); 1355 if (Known.isNonNegative()) 1356 return Constant::getNullValue(V->getType()); 1357 return Builder.CreateAShr(V, Builder.getInt32(31)); 1358 } 1359 1360 Value *AMDGPUCodeGenPrepareImpl::expandDivRem32(IRBuilder<> &Builder, 1361 BinaryOperator &I, Value *X, 1362 Value *Y) const { 1363 Instruction::BinaryOps Opc = I.getOpcode(); 1364 assert(Opc == Instruction::URem || Opc == Instruction::UDiv || 1365 Opc == Instruction::SRem || Opc == Instruction::SDiv); 1366 1367 FastMathFlags FMF; 1368 FMF.setFast(); 1369 Builder.setFastMathFlags(FMF); 1370 1371 if (divHasSpecialOptimization(I, X, Y)) 1372 return nullptr; // Keep it for later optimization. 
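  // Note on the signed path below: getSign32 returns 0 for a non-negative
  // value and -1 otherwise, so (x + sign) ^ sign computes |x| (a two's
  // complement negate when sign is -1), and the trailing xor/sub pair applies
  // the quotient/remainder sign to the unsigned result.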
1373 1374 bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv; 1375 bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv; 1376 1377 Type *Ty = X->getType(); 1378 Type *I32Ty = Builder.getInt32Ty(); 1379 Type *F32Ty = Builder.getFloatTy(); 1380 1381 if (Ty->getScalarSizeInBits() < 32) { 1382 if (IsSigned) { 1383 X = Builder.CreateSExt(X, I32Ty); 1384 Y = Builder.CreateSExt(Y, I32Ty); 1385 } else { 1386 X = Builder.CreateZExt(X, I32Ty); 1387 Y = Builder.CreateZExt(Y, I32Ty); 1388 } 1389 } 1390 1391 if (Value *Res = expandDivRem24(Builder, I, X, Y, IsDiv, IsSigned)) { 1392 return IsSigned ? Builder.CreateSExtOrTrunc(Res, Ty) : 1393 Builder.CreateZExtOrTrunc(Res, Ty); 1394 } 1395 1396 ConstantInt *Zero = Builder.getInt32(0); 1397 ConstantInt *One = Builder.getInt32(1); 1398 1399 Value *Sign = nullptr; 1400 if (IsSigned) { 1401 Value *SignX = getSign32(X, Builder, DL); 1402 Value *SignY = getSign32(Y, Builder, DL); 1403 // Remainder sign is the same as LHS 1404 Sign = IsDiv ? Builder.CreateXor(SignX, SignY) : SignX; 1405 1406 X = Builder.CreateAdd(X, SignX); 1407 Y = Builder.CreateAdd(Y, SignY); 1408 1409 X = Builder.CreateXor(X, SignX); 1410 Y = Builder.CreateXor(Y, SignY); 1411 } 1412 1413 // The algorithm here is based on ideas from "Software Integer Division", Tom 1414 // Rodeheffer, August 2008. 1415 // 1416 // unsigned udiv(unsigned x, unsigned y) { 1417 // // Initial estimate of inv(y). The constant is less than 2^32 to ensure 1418 // // that this is a lower bound on inv(y), even if some of the calculations 1419 // // round up. 1420 // unsigned z = (unsigned)((4294967296.0 - 512.0) * v_rcp_f32((float)y)); 1421 // 1422 // // One round of UNR (Unsigned integer Newton-Raphson) to improve z. 1423 // // Empirically this is guaranteed to give a "two-y" lower bound on 1424 // // inv(y). 1425 // z += umulh(z, -y * z); 1426 // 1427 // // Quotient/remainder estimate. 1428 // unsigned q = umulh(x, z); 1429 // unsigned r = x - q * y; 1430 // 1431 // // Two rounds of quotient/remainder refinement. 1432 // if (r >= y) { 1433 // ++q; 1434 // r -= y; 1435 // } 1436 // if (r >= y) { 1437 // ++q; 1438 // r -= y; 1439 // } 1440 // 1441 // return q; 1442 // } 1443 1444 // Initial estimate of inv(y). 1445 Value *FloatY = Builder.CreateUIToFP(Y, F32Ty); 1446 Function *Rcp = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp, F32Ty); 1447 Value *RcpY = Builder.CreateCall(Rcp, {FloatY}); 1448 Constant *Scale = ConstantFP::get(F32Ty, llvm::bit_cast<float>(0x4F7FFFFE)); 1449 Value *ScaledY = Builder.CreateFMul(RcpY, Scale); 1450 Value *Z = Builder.CreateFPToUI(ScaledY, I32Ty); 1451 1452 // One round of UNR. 1453 Value *NegY = Builder.CreateSub(Zero, Y); 1454 Value *NegYZ = Builder.CreateMul(NegY, Z); 1455 Z = Builder.CreateAdd(Z, getMulHu(Builder, Z, NegYZ)); 1456 1457 // Quotient/remainder estimate. 1458 Value *Q = getMulHu(Builder, X, Z); 1459 Value *R = Builder.CreateSub(X, Builder.CreateMul(Q, Y)); 1460 1461 // First quotient/remainder refinement. 1462 Value *Cond = Builder.CreateICmpUGE(R, Y); 1463 if (IsDiv) 1464 Q = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q); 1465 R = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R); 1466 1467 // Second quotient/remainder refinement. 
1468 Cond = Builder.CreateICmpUGE(R, Y); 1469 Value *Res; 1470 if (IsDiv) 1471 Res = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q); 1472 else 1473 Res = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R); 1474 1475 if (IsSigned) { 1476 Res = Builder.CreateXor(Res, Sign); 1477 Res = Builder.CreateSub(Res, Sign); 1478 } 1479 1480 Res = Builder.CreateTrunc(Res, Ty); 1481 1482 return Res; 1483 } 1484 1485 Value *AMDGPUCodeGenPrepareImpl::shrinkDivRem64(IRBuilder<> &Builder, 1486 BinaryOperator &I, Value *Num, 1487 Value *Den) const { 1488 if (!ExpandDiv64InIR && divHasSpecialOptimization(I, Num, Den)) 1489 return nullptr; // Keep it for later optimization. 1490 1491 Instruction::BinaryOps Opc = I.getOpcode(); 1492 1493 bool IsDiv = Opc == Instruction::SDiv || Opc == Instruction::UDiv; 1494 bool IsSigned = Opc == Instruction::SDiv || Opc == Instruction::SRem; 1495 1496 int NumDivBits = getDivNumBits(I, Num, Den, 32, IsSigned); 1497 if (NumDivBits == -1) 1498 return nullptr; 1499 1500 Value *Narrowed = nullptr; 1501 if (NumDivBits <= 24) { 1502 Narrowed = expandDivRem24Impl(Builder, I, Num, Den, NumDivBits, 1503 IsDiv, IsSigned); 1504 } else if (NumDivBits <= 32) { 1505 Narrowed = expandDivRem32(Builder, I, Num, Den); 1506 } 1507 1508 if (Narrowed) { 1509 return IsSigned ? Builder.CreateSExt(Narrowed, Num->getType()) : 1510 Builder.CreateZExt(Narrowed, Num->getType()); 1511 } 1512 1513 return nullptr; 1514 } 1515 1516 void AMDGPUCodeGenPrepareImpl::expandDivRem64(BinaryOperator &I) const { 1517 Instruction::BinaryOps Opc = I.getOpcode(); 1518 // Do the general expansion. 1519 if (Opc == Instruction::UDiv || Opc == Instruction::SDiv) { 1520 expandDivisionUpTo64Bits(&I); 1521 return; 1522 } 1523 1524 if (Opc == Instruction::URem || Opc == Instruction::SRem) { 1525 expandRemainderUpTo64Bits(&I); 1526 return; 1527 } 1528 1529 llvm_unreachable("not a division"); 1530 } 1531 1532 bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) { 1533 if (foldBinOpIntoSelect(I)) 1534 return true; 1535 1536 if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) && 1537 UA->isUniform(&I) && promoteUniformOpToI32(I)) 1538 return true; 1539 1540 if (UseMul24Intrin && replaceMulWithMul24(I)) 1541 return true; 1542 1543 bool Changed = false; 1544 Instruction::BinaryOps Opc = I.getOpcode(); 1545 Type *Ty = I.getType(); 1546 Value *NewDiv = nullptr; 1547 unsigned ScalarSize = Ty->getScalarSizeInBits(); 1548 1549 SmallVector<BinaryOperator *, 8> Div64ToExpand; 1550 1551 if ((Opc == Instruction::URem || Opc == Instruction::UDiv || 1552 Opc == Instruction::SRem || Opc == Instruction::SDiv) && 1553 ScalarSize <= 64 && 1554 !DisableIDivExpand) { 1555 Value *Num = I.getOperand(0); 1556 Value *Den = I.getOperand(1); 1557 IRBuilder<> Builder(&I); 1558 Builder.SetCurrentDebugLocation(I.getDebugLoc()); 1559 1560 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) { 1561 NewDiv = PoisonValue::get(VT); 1562 1563 for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) { 1564 Value *NumEltN = Builder.CreateExtractElement(Num, N); 1565 Value *DenEltN = Builder.CreateExtractElement(Den, N); 1566 1567 Value *NewElt; 1568 if (ScalarSize <= 32) { 1569 NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN); 1570 if (!NewElt) 1571 NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN); 1572 } else { 1573 // See if this 64-bit division can be shrunk to 32/24-bits before 1574 // producing the general expansion. 
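          // (The scalar binop created below is queued in Div64ToExpand; if
          // ExpandDiv64InIR is set it is expanded after the visit via
          // expandDivisionUpTo64Bits/expandRemainderUpTo64Bits, since that
          // expansion introduces control flow.)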
1575 NewElt = shrinkDivRem64(Builder, I, NumEltN, DenEltN); 1576 if (!NewElt) { 1577 // The general 64-bit expansion introduces control flow and doesn't 1578 // return the new value. Just insert a scalar copy and defer 1579 // expanding it. 1580 NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN); 1581 Div64ToExpand.push_back(cast<BinaryOperator>(NewElt)); 1582 } 1583 } 1584 1585 NewDiv = Builder.CreateInsertElement(NewDiv, NewElt, N); 1586 } 1587 } else { 1588 if (ScalarSize <= 32) 1589 NewDiv = expandDivRem32(Builder, I, Num, Den); 1590 else { 1591 NewDiv = shrinkDivRem64(Builder, I, Num, Den); 1592 if (!NewDiv) 1593 Div64ToExpand.push_back(&I); 1594 } 1595 } 1596 1597 if (NewDiv) { 1598 I.replaceAllUsesWith(NewDiv); 1599 I.eraseFromParent(); 1600 Changed = true; 1601 } 1602 } 1603 1604 if (ExpandDiv64InIR) { 1605 // TODO: We get much worse code in specially handled constant cases. 1606 for (BinaryOperator *Div : Div64ToExpand) { 1607 expandDivRem64(*Div); 1608 FlowChanged = true; 1609 Changed = true; 1610 } 1611 } 1612 1613 return Changed; 1614 } 1615 1616 bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) { 1617 if (!WidenLoads) 1618 return false; 1619 1620 if ((I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS || 1621 I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) && 1622 canWidenScalarExtLoad(I)) { 1623 IRBuilder<> Builder(&I); 1624 Builder.SetCurrentDebugLocation(I.getDebugLoc()); 1625 1626 Type *I32Ty = Builder.getInt32Ty(); 1627 LoadInst *WidenLoad = Builder.CreateLoad(I32Ty, I.getPointerOperand()); 1628 WidenLoad->copyMetadata(I); 1629 1630 // If we have range metadata, we need to convert the type, and not make 1631 // assumptions about the high bits. 1632 if (auto *Range = WidenLoad->getMetadata(LLVMContext::MD_range)) { 1633 ConstantInt *Lower = 1634 mdconst::extract<ConstantInt>(Range->getOperand(0)); 1635 1636 if (Lower->isNullValue()) { 1637 WidenLoad->setMetadata(LLVMContext::MD_range, nullptr); 1638 } else { 1639 Metadata *LowAndHigh[] = { 1640 ConstantAsMetadata::get(ConstantInt::get(I32Ty, Lower->getValue().zext(32))), 1641 // Don't make assumptions about the high bits. 1642 ConstantAsMetadata::get(ConstantInt::get(I32Ty, 0)) 1643 }; 1644 1645 WidenLoad->setMetadata(LLVMContext::MD_range, 1646 MDNode::get(Mod->getContext(), LowAndHigh)); 1647 } 1648 } 1649 1650 int TySize = Mod->getDataLayout().getTypeSizeInBits(I.getType()); 1651 Type *IntNTy = Builder.getIntNTy(TySize); 1652 Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy); 1653 Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType()); 1654 I.replaceAllUsesWith(ValOrig); 1655 I.eraseFromParent(); 1656 return true; 1657 } 1658 1659 return false; 1660 } 1661 1662 bool AMDGPUCodeGenPrepareImpl::visitICmpInst(ICmpInst &I) { 1663 bool Changed = false; 1664 1665 if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) && 1666 UA->isUniform(&I)) 1667 Changed |= promoteUniformOpToI32(I); 1668 1669 return Changed; 1670 } 1671 1672 bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) { 1673 Value *Cond = I.getCondition(); 1674 Value *TrueVal = I.getTrueValue(); 1675 Value *FalseVal = I.getFalseValue(); 1676 Value *CmpVal; 1677 FCmpInst::Predicate Pred; 1678 1679 if (ST->has16BitInsts() && needsPromotionToI32(I.getType())) { 1680 if (UA->isUniform(&I)) 1681 return promoteUniformOpToI32(I); 1682 return false; 1683 } 1684 1685 // Match fract pattern with nan check. 
  if (!match(Cond, m_FCmp(Pred, m_Value(CmpVal), m_NonNaN())))
    return false;

  FPMathOperator *FPOp = dyn_cast<FPMathOperator>(&I);
  if (!FPOp)
    return false;

  IRBuilder<> Builder(&I);
  Builder.setFastMathFlags(FPOp->getFastMathFlags());

  auto *IITrue = dyn_cast<IntrinsicInst>(TrueVal);
  auto *IIFalse = dyn_cast<IntrinsicInst>(FalseVal);

  Value *Fract = nullptr;
  if (Pred == FCmpInst::FCMP_UNO && TrueVal == CmpVal && IIFalse &&
      CmpVal == matchFractPat(*IIFalse)) {
    // isnan(x) ? x : fract(x)
    Fract = applyFractPat(Builder, CmpVal);
  } else if (Pred == FCmpInst::FCMP_ORD && FalseVal == CmpVal && IITrue &&
             CmpVal == matchFractPat(*IITrue)) {
    // !isnan(x) ? fract(x) : x
    Fract = applyFractPat(Builder, CmpVal);
  } else
    return false;

  Fract->takeName(&I);
  I.replaceAllUsesWith(Fract);
  RecursivelyDeleteTriviallyDeadInstructions(&I, TLInfo);
  return true;
}

static bool areInSameBB(const Value *A, const Value *B) {
  const auto *IA = dyn_cast<Instruction>(A);
  const auto *IB = dyn_cast<Instruction>(B);
  return IA && IB && IA->getParent() == IB->getParent();
}

// Helper for breaking large PHIs that returns true when an extractelement on V
// is likely to be folded away by the DAG combiner.
static bool isInterestingPHIIncomingValue(const Value *V) {
  const auto *FVT = dyn_cast<FixedVectorType>(V->getType());
  if (!FVT)
    return false;

  const Value *CurVal = V;

  // Check for insertelements, keeping track of the elements covered.
  BitVector EltsCovered(FVT->getNumElements());
  while (const auto *IE = dyn_cast<InsertElementInst>(CurVal)) {
    const auto *Idx = dyn_cast<ConstantInt>(IE->getOperand(2));

    // A non-constant or out-of-bounds index makes folding unlikely. The latter
    // is more of a sanity check because canonical IR should already have
    // replaced such indices with poison.
    if (!Idx || Idx->getSExtValue() >= FVT->getNumElements())
      return false;

    const auto *VecSrc = IE->getOperand(0);

    // If the vector source is another instruction, it must be in the same
    // basic block. Otherwise, the DAGCombiner won't see the whole thing and is
    // unlikely to be able to do anything interesting here.
    if (isa<Instruction>(VecSrc) && !areInSameBB(VecSrc, IE))
      return false;

    CurVal = VecSrc;
    EltsCovered.set(Idx->getSExtValue());

    // All elements covered.
    if (EltsCovered.all())
      return true;
  }

  // We either didn't find a single insertelement, or the insertelement chain
  // ended before all elements were covered. Check for other interesting values.

  // Constants are always interesting because we can just constant-fold the
  // extractelements.
  if (isa<Constant>(CurVal))
    return true;

  // shufflevector is likely to be profitable if either operand is a constant,
  // or if either source is in the same block.
  // This is because shufflevector is most often lowered as a series of
  // insert/extract elements anyway.
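  //
  // For example (illustrative only), extracting a lane from
  //   %v = shufflevector <4 x i32> %a, <4 x i32> <i32 5, i32 6, i32 7, i32 8>,
  //                      <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  // folds to either a lane of %a or a constant, as long as the DAG can see %a.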
  if (const auto *SV = dyn_cast<ShuffleVectorInst>(CurVal)) {
    return isa<Constant>(SV->getOperand(1)) ||
           areInSameBB(SV, SV->getOperand(0)) ||
           areInSameBB(SV, SV->getOperand(1));
  }

  return false;
}

bool AMDGPUCodeGenPrepareImpl::canBreakPHINode(const PHINode &I) {
  // Check in the cache, or add an entry for this node.
  //
  // We init with false because we consider all PHI nodes unbreakable until we
  // reach a conclusion. Doing the opposite - assuming they're breakable until
  // proven otherwise - can be harmful in some pathological cases, so we're
  // conservative for now.
  const auto [It, DidInsert] = BreakPhiNodesCache.insert({&I, false});
  if (!DidInsert)
    return It->second;

  // This function may recurse, so to guard against infinite looping, this PHI
  // is conservatively considered unbreakable until we reach a conclusion.

  // Don't break PHIs that have no interesting incoming values. That is, where
  // there is no clear opportunity to fold the "extractelement" instructions we
  // would add.
  //
  // Note: IC does not run after this pass, so we're only interested in the
  // foldings that the DAG combiner can do.
  if (none_of(I.incoming_values(),
              [&](Value *V) { return isInterestingPHIIncomingValue(V); }))
    return false;

  // Now, check users for unbreakable PHI nodes. If we have an unbreakable PHI
  // node as user, we don't want to break this PHI either because it's unlikely
  // to be beneficial. We would just explode the vector and reassemble it
  // directly, wasting instructions.
  //
  // In the case where multiple users are PHI nodes, we want at least half of
  // them to be breakable.
  int Score = 0;
  for (const Value *U : I.users()) {
    if (const auto *PU = dyn_cast<PHINode>(U))
      Score += canBreakPHINode(*PU) ? 1 : -1;
  }

  if (Score < 0)
    return false;

  return BreakPhiNodesCache[&I] = true;
}

/// Helper class for "break large PHIs" (visitPHINode).
///
/// This represents a slice of a PHI's incoming value, which is made up of:
/// - The type of the slice (Ty)
/// - The index in the incoming value's vector where the slice starts (Idx)
/// - The number of elements in the slice (NumElts).
/// It also keeps track of the NewPHI node inserted for this particular slice.
///
/// Slice examples:
///   <4 x i64> -> Split into four i64 slices.
///     -> [i64, 0, 1], [i64, 1, 1], [i64, 2, 1], [i64, 3, 1]
///   <5 x i16> -> Split into 2 <2 x i16> slices + an i16 tail.
///     -> [<2 x i16>, 0, 2], [<2 x i16>, 2, 2], [i16, 4, 1]
class VectorSlice {
public:
  VectorSlice(Type *Ty, unsigned Idx, unsigned NumElts)
      : Ty(Ty), Idx(Idx), NumElts(NumElts) {}

  Type *Ty = nullptr;
  unsigned Idx = 0;
  unsigned NumElts = 0;
  PHINode *NewPHI = nullptr;

  /// Slice \p Inc according to the information contained within this slice.
  /// This is cached, so if called multiple times for the same \p BB & \p Inc
  /// pair, it returns the same sliced value as well.
  ///
  /// Note this *intentionally* does not return the same value for, say,
  /// [%bb.0, %0] & [%bb.1, %0] as:
  /// - It could cause issues with dominance (e.g. if bb.1 is seen first, then
  ///   the value in bb.1 may not be reachable from bb.0 if it's its
  ///   predecessor.)
  /// - We also want to make our extract instructions as local as possible so
  ///   the DAG has better chances of folding them out. Duplicating them like
  ///   that is beneficial in that regard.
  ///
  /// This is both a minor optimization to avoid creating duplicate
  /// instructions and a requirement for correctness. It is not forbidden for a
  /// PHI node to have the same [BB, Val] pair multiple times. If we returned a
  /// new value each time, those previously identical pairs would all have
  /// different incoming values (from the same block) and it'd cause a "PHI
  /// node has multiple entries for the same basic block with different
  /// incoming values!" verifier error.
  Value *getSlicedVal(BasicBlock *BB, Value *Inc, StringRef NewValName) {
    Value *&Res = SlicedVals[{BB, Inc}];
    if (Res)
      return Res;

    IRBuilder<> B(BB->getTerminator());
    if (Instruction *IncInst = dyn_cast<Instruction>(Inc))
      B.SetCurrentDebugLocation(IncInst->getDebugLoc());

    if (NumElts > 1) {
      SmallVector<int, 4> Mask;
      for (unsigned K = Idx; K < (Idx + NumElts); ++K)
        Mask.push_back(K);
      Res = B.CreateShuffleVector(Inc, Mask, NewValName);
    } else
      Res = B.CreateExtractElement(Inc, Idx, NewValName);

    return Res;
  }

private:
  SmallDenseMap<std::pair<BasicBlock *, Value *>, Value *> SlicedVals;
};

bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) {
  // Break up fixed-vector PHIs into smaller pieces.
  // The default threshold is 32, so this breaks up any vector that's > 32 bits
  // into its elements, or into 32-bit pieces (for 8/16-bit elements).
  //
  // This is only helpful for DAGISel because it doesn't handle large PHIs as
  // well as GlobalISel. DAGISel lowers PHIs by using CopyToReg/CopyFromReg.
  // With large, odd-sized PHIs we may end up needing many `build_vector`
  // operations with most elements being "undef". This inhibits a lot of
  // optimization opportunities and can result in unreasonably high register
  // pressure and the inevitable stack spilling.
  if (!ScalarizeLargePHIs || getCGPassBuilderOption().EnableGlobalISelOption)
    return false;

  FixedVectorType *FVT = dyn_cast<FixedVectorType>(I.getType());
  if (!FVT || DL->getTypeSizeInBits(FVT) <= ScalarizeLargePHIsThreshold)
    return false;

  if (!ForceScalarizeLargePHIs && !canBreakPHINode(I))
    return false;

  std::vector<VectorSlice> Slices;

  Type *EltTy = FVT->getElementType();
  {
    unsigned Idx = 0;
    // For 8/16-bit types, don't scalarize fully but break the vector up into
    // as many 32-bit slices as we can, and scalarize the tail.
    const unsigned EltSize = DL->getTypeSizeInBits(EltTy);
    const unsigned NumElts = FVT->getNumElements();
    if (EltSize == 8 || EltSize == 16) {
      const unsigned SubVecSize = (32 / EltSize);
      Type *SubVecTy = FixedVectorType::get(EltTy, SubVecSize);
      for (unsigned End = alignDown(NumElts, SubVecSize); Idx < End;
           Idx += SubVecSize)
        Slices.emplace_back(SubVecTy, Idx, SubVecSize);
    }

    // Scalarize all remaining elements.
    for (; Idx < NumElts; ++Idx)
      Slices.emplace_back(EltTy, Idx, 1);
  }

  if (Slices.size() == 1)
    return false;

  // Create one PHI per vector piece. The "VectorSlice" class takes care of
The "VectorSlice" class takes care of 1937 // creating the necessary instruction to extract the relevant slices of each 1938 // incoming value. 1939 IRBuilder<> B(I.getParent()); 1940 B.SetCurrentDebugLocation(I.getDebugLoc()); 1941 1942 unsigned IncNameSuffix = 0; 1943 for (VectorSlice &S : Slices) { 1944 // We need to reset the build on each iteration, because getSlicedVal may 1945 // have inserted something into I's BB. 1946 B.SetInsertPoint(I.getParent()->getFirstNonPHI()); 1947 S.NewPHI = B.CreatePHI(S.Ty, I.getNumIncomingValues()); 1948 1949 for (const auto &[Idx, BB] : enumerate(I.blocks())) { 1950 S.NewPHI->addIncoming(S.getSlicedVal(BB, I.getIncomingValue(Idx), 1951 "largephi.extractslice" + 1952 std::to_string(IncNameSuffix++)), 1953 BB); 1954 } 1955 } 1956 1957 // And replace this PHI with a vector of all the previous PHI values. 1958 Value *Vec = PoisonValue::get(FVT); 1959 unsigned NameSuffix = 0; 1960 for (VectorSlice &S : Slices) { 1961 const auto ValName = "largephi.insertslice" + std::to_string(NameSuffix++); 1962 if (S.NumElts > 1) 1963 Vec = 1964 B.CreateInsertVector(FVT, Vec, S.NewPHI, B.getInt64(S.Idx), ValName); 1965 else 1966 Vec = B.CreateInsertElement(Vec, S.NewPHI, S.Idx, ValName); 1967 } 1968 1969 I.replaceAllUsesWith(Vec); 1970 I.eraseFromParent(); 1971 return true; 1972 } 1973 1974 bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) { 1975 switch (I.getIntrinsicID()) { 1976 case Intrinsic::bitreverse: 1977 return visitBitreverseIntrinsicInst(I); 1978 case Intrinsic::minnum: 1979 return visitMinNum(I); 1980 default: 1981 return false; 1982 } 1983 } 1984 1985 bool AMDGPUCodeGenPrepareImpl::visitBitreverseIntrinsicInst(IntrinsicInst &I) { 1986 bool Changed = false; 1987 1988 if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) && 1989 UA->isUniform(&I)) 1990 Changed |= promoteUniformBitreverseToI32(I); 1991 1992 return Changed; 1993 } 1994 1995 /// Match non-nan fract pattern. 1996 /// minnum(fsub(x, floor(x)), nextafter(1.0, -1.0) 1997 /// 1998 /// If fract is a useful instruction for the subtarget. Does not account for the 1999 /// nan handling; the instruction has a nan check on the input value. 
Value *AMDGPUCodeGenPrepareImpl::matchFractPat(IntrinsicInst &I) {
  if (ST->hasFractBug())
    return nullptr;

  if (I.getIntrinsicID() != Intrinsic::minnum)
    return nullptr;

  Type *Ty = I.getType();
  if (!isLegalFloatingTy(Ty->getScalarType()))
    return nullptr;

  Value *Arg0 = I.getArgOperand(0);
  Value *Arg1 = I.getArgOperand(1);

  const APFloat *C;
  if (!match(Arg1, m_APFloat(C)))
    return nullptr;

  APFloat One(1.0);
  bool LosesInfo;
  One.convert(C->getSemantics(), APFloat::rmNearestTiesToEven, &LosesInfo);

  // Match nextafter(1.0, -1.0).
  One.next(true);
  if (One != *C)
    return nullptr;

  Value *FloorSrc;
  if (match(Arg0, m_FSub(m_Value(FloorSrc),
                         m_Intrinsic<Intrinsic::floor>(m_Deferred(FloorSrc)))))
    return FloorSrc;
  return nullptr;
}

Value *AMDGPUCodeGenPrepareImpl::applyFractPat(IRBuilder<> &Builder,
                                               Value *FractArg) {
  SmallVector<Value *, 4> FractVals;
  extractValues(Builder, FractVals, FractArg);

  SmallVector<Value *, 4> ResultVals(FractVals.size());

  Type *Ty = FractArg->getType()->getScalarType();
  for (unsigned I = 0, E = FractVals.size(); I != E; ++I) {
    ResultVals[I] =
        Builder.CreateIntrinsic(Intrinsic::amdgcn_fract, {Ty}, {FractVals[I]});
  }

  return insertValues(Builder, FractArg->getType(), ResultVals);
}

bool AMDGPUCodeGenPrepareImpl::visitMinNum(IntrinsicInst &I) {
  Value *FractArg = matchFractPat(I);
  if (!FractArg)
    return false;

  // Match the pattern for the fract intrinsic in contexts where the nan check
  // has been optimized out (and hope the knowledge that the source can't be
  // nan wasn't lost).
  if (!I.hasNoNaNs() && !isKnownNeverNaN(FractArg, *DL, TLInfo))
    return false;

  IRBuilder<> Builder(&I);
  FastMathFlags FMF = I.getFastMathFlags();
  FMF.setNoNaNs();
  Builder.setFastMathFlags(FMF);

  Value *Fract = applyFractPat(Builder, FractArg);
  Fract->takeName(&I);
  I.replaceAllUsesWith(Fract);

  RecursivelyDeleteTriviallyDeadInstructions(&I, TLInfo);
  return true;
}

bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
  Impl.Mod = &M;
  Impl.DL = &Impl.Mod->getDataLayout();
  return false;
}

bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
  Impl.TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
  Impl.ST = &TM.getSubtarget<GCNSubtarget>(F);
  Impl.AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  Impl.UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
  auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  Impl.DT = DTWP ? &DTWP->getDomTree() : nullptr;
&DTWP->getDomTree() : nullptr; 2094 Impl.HasUnsafeFPMath = hasUnsafeFPMath(F); 2095 SIModeRegisterDefaults Mode(F); 2096 Impl.HasFP32DenormalFlush = 2097 Mode.FP32Denormals == DenormalMode::getPreserveSign(); 2098 return Impl.run(F); 2099 } 2100 2101 PreservedAnalyses AMDGPUCodeGenPreparePass::run(Function &F, 2102 FunctionAnalysisManager &FAM) { 2103 AMDGPUCodeGenPrepareImpl Impl; 2104 Impl.Mod = F.getParent(); 2105 Impl.DL = &Impl.Mod->getDataLayout(); 2106 Impl.TLInfo = &FAM.getResult<TargetLibraryAnalysis>(F); 2107 Impl.ST = &TM.getSubtarget<GCNSubtarget>(F); 2108 Impl.AC = &FAM.getResult<AssumptionAnalysis>(F); 2109 Impl.UA = &FAM.getResult<UniformityInfoAnalysis>(F); 2110 Impl.DT = FAM.getCachedResult<DominatorTreeAnalysis>(F); 2111 Impl.HasUnsafeFPMath = hasUnsafeFPMath(F); 2112 SIModeRegisterDefaults Mode(F); 2113 Impl.HasFP32DenormalFlush = 2114 Mode.FP32Denormals == DenormalMode::getPreserveSign(); 2115 PreservedAnalyses PA = PreservedAnalyses::none(); 2116 if (!Impl.FlowChanged) 2117 PA.preserveSet<CFGAnalyses>(); 2118 return Impl.run(F) ? PA : PreservedAnalyses::all(); 2119 } 2120 2121 INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE, 2122 "AMDGPU IR optimizations", false, false) 2123 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 2124 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) 2125 INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass) 2126 INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations", 2127 false, false) 2128 2129 char AMDGPUCodeGenPrepare::ID = 0; 2130 2131 FunctionPass *llvm::createAMDGPUCodeGenPreparePass() { 2132 return new AMDGPUCodeGenPrepare(); 2133 } 2134