1 //===- IndVarSimplify.cpp - Induction Variable Elimination ----------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This transformation analyzes and transforms the induction variables (and 10 // computations derived from them) into simpler forms suitable for subsequent 11 // analysis and transformation. 12 // 13 // If the trip count of a loop is computable, this pass also makes the following 14 // changes: 15 // 1. The exit condition for the loop is canonicalized to compare the 16 // induction value against the exit value. This turns loops like: 17 // 'for (i = 7; i*i < 1000; ++i)' into 'for (i = 0; i != 25; ++i)' 18 // 2. Any use outside of the loop of an expression derived from the indvar 19 // is changed to compute the derived value outside of the loop, eliminating 20 // the dependence on the exit value of the induction variable. If the only 21 // purpose of the loop is to compute the exit value of some derived 22 // expression, this transformation will make the loop dead. 23 // 24 //===----------------------------------------------------------------------===// 25 26 #include "llvm/Transforms/Scalar/IndVarSimplify.h" 27 #include "llvm/ADT/APFloat.h" 28 #include "llvm/ADT/ArrayRef.h" 29 #include "llvm/ADT/STLExtras.h" 30 #include "llvm/ADT/SmallPtrSet.h" 31 #include "llvm/ADT/SmallSet.h" 32 #include "llvm/ADT/SmallVector.h" 33 #include "llvm/ADT/Statistic.h" 34 #include "llvm/ADT/iterator_range.h" 35 #include "llvm/Analysis/LoopInfo.h" 36 #include "llvm/Analysis/LoopPass.h" 37 #include "llvm/Analysis/MemorySSA.h" 38 #include "llvm/Analysis/MemorySSAUpdater.h" 39 #include "llvm/Analysis/ScalarEvolution.h" 40 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 41 #include "llvm/Analysis/ScalarEvolutionPatternMatch.h" 42 #include "llvm/Analysis/TargetLibraryInfo.h" 43 #include "llvm/Analysis/TargetTransformInfo.h" 44 #include "llvm/Analysis/ValueTracking.h" 45 #include "llvm/IR/BasicBlock.h" 46 #include "llvm/IR/Constant.h" 47 #include "llvm/IR/ConstantRange.h" 48 #include "llvm/IR/Constants.h" 49 #include "llvm/IR/DataLayout.h" 50 #include "llvm/IR/DerivedTypes.h" 51 #include "llvm/IR/Dominators.h" 52 #include "llvm/IR/Function.h" 53 #include "llvm/IR/IRBuilder.h" 54 #include "llvm/IR/InstrTypes.h" 55 #include "llvm/IR/Instruction.h" 56 #include "llvm/IR/Instructions.h" 57 #include "llvm/IR/Intrinsics.h" 58 #include "llvm/IR/PassManager.h" 59 #include "llvm/IR/PatternMatch.h" 60 #include "llvm/IR/Type.h" 61 #include "llvm/IR/Use.h" 62 #include "llvm/IR/User.h" 63 #include "llvm/IR/Value.h" 64 #include "llvm/IR/ValueHandle.h" 65 #include "llvm/Support/Casting.h" 66 #include "llvm/Support/CommandLine.h" 67 #include "llvm/Support/Debug.h" 68 #include "llvm/Support/MathExtras.h" 69 #include "llvm/Support/raw_ostream.h" 70 #include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h" 71 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 72 #include "llvm/Transforms/Utils/Local.h" 73 #include "llvm/Transforms/Utils/LoopUtils.h" 74 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" 75 #include "llvm/Transforms/Utils/SimplifyIndVar.h" 76 #include <cassert> 77 #include <cstdint> 78 #include <utility> 79 80 using namespace llvm; 81 using namespace PatternMatch; 82 using namespace SCEVPatternMatch; 83 84 #define DEBUG_TYPE "indvars" 85 86 STATISTIC(NumWidened , "Number of indvars widened"); 87 STATISTIC(NumReplaced , "Number of exit values replaced"); 88 STATISTIC(NumLFTR , "Number of loop exit tests replaced"); 89 STATISTIC(NumElimExt , "Number of IV sign/zero extends eliminated"); 90 STATISTIC(NumElimIV , "Number of congruent IVs eliminated"); 91 92 static cl::opt<ReplaceExitVal> ReplaceExitValue( 93 "replexitval", cl::Hidden, cl::init(OnlyCheapRepl), 94 cl::desc("Choose the strategy to replace exit value in IndVarSimplify"), 95 cl::values( 96 clEnumValN(NeverRepl, "never", "never replace exit value"), 97 clEnumValN(OnlyCheapRepl, "cheap", 98 "only replace exit value when the cost is cheap"), 99 clEnumValN( 100 UnusedIndVarInLoop, "unusedindvarinloop", 101 "only replace exit value when it is an unused " 102 "induction variable in the loop and has cheap replacement cost"), 103 clEnumValN(NoHardUse, "noharduse", 104 "only replace exit values when loop def likely dead"), 105 clEnumValN(AlwaysRepl, "always", 106 "always replace exit value whenever possible"))); 107 108 static cl::opt<bool> UsePostIncrementRanges( 109 "indvars-post-increment-ranges", cl::Hidden, 110 cl::desc("Use post increment control-dependent ranges in IndVarSimplify"), 111 cl::init(true)); 112 113 static cl::opt<bool> 114 DisableLFTR("disable-lftr", cl::Hidden, cl::init(false), 115 cl::desc("Disable Linear Function Test Replace optimization")); 116 117 static cl::opt<bool> 118 LoopPredication("indvars-predicate-loops", cl::Hidden, cl::init(true), 119 cl::desc("Predicate conditions in read only loops")); 120 121 static cl::opt<bool> 122 AllowIVWidening("indvars-widen-indvars", cl::Hidden, cl::init(true), 123 cl::desc("Allow widening of indvars to eliminate s/zext")); 124 125 namespace { 126 127 class IndVarSimplify { 128 LoopInfo *LI; 129 ScalarEvolution *SE; 130 DominatorTree *DT; 131 const DataLayout &DL; 132 TargetLibraryInfo *TLI; 133 const TargetTransformInfo *TTI; 134 std::unique_ptr<MemorySSAUpdater> MSSAU; 135 136 SmallVector<WeakTrackingVH, 16> DeadInsts; 137 bool WidenIndVars; 138 139 bool RunUnswitching = false; 140 141 bool handleFloatingPointIV(Loop *L, PHINode *PH); 142 bool rewriteNonIntegerIVs(Loop *L); 143 144 bool simplifyAndExtend(Loop *L, SCEVExpander &Rewriter, LoopInfo *LI); 145 /// Try to improve our exit conditions by converting condition from signed 146 /// to unsigned or rotating computation out of the loop. 147 /// (See inline comment about why this is duplicated from simplifyAndExtend) 148 bool canonicalizeExitCondition(Loop *L); 149 /// Try to eliminate loop exits based on analyzeable exit counts 150 bool optimizeLoopExits(Loop *L, SCEVExpander &Rewriter); 151 /// Try to form loop invariant tests for loop exits by changing how many 152 /// iterations of the loop run when that is unobservable. 153 bool predicateLoopExits(Loop *L, SCEVExpander &Rewriter); 154 155 bool rewriteFirstIterationLoopExitValues(Loop *L); 156 157 bool linearFunctionTestReplace(Loop *L, BasicBlock *ExitingBB, 158 const SCEV *ExitCount, 159 PHINode *IndVar, SCEVExpander &Rewriter); 160 161 bool sinkUnusedInvariants(Loop *L); 162 163 public: 164 IndVarSimplify(LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 165 const DataLayout &DL, TargetLibraryInfo *TLI, 166 TargetTransformInfo *TTI, MemorySSA *MSSA, bool WidenIndVars) 167 : LI(LI), SE(SE), DT(DT), DL(DL), TLI(TLI), TTI(TTI), 168 WidenIndVars(WidenIndVars) { 169 if (MSSA) 170 MSSAU = std::make_unique<MemorySSAUpdater>(MSSA); 171 } 172 173 bool run(Loop *L); 174 175 bool runUnswitching() const { return RunUnswitching; } 176 }; 177 178 } // end anonymous namespace 179 180 //===----------------------------------------------------------------------===// 181 // rewriteNonIntegerIVs and helpers. Prefer integer IVs. 182 //===----------------------------------------------------------------------===// 183 184 /// Convert APF to an integer, if possible. 185 static bool ConvertToSInt(const APFloat &APF, int64_t &IntVal) { 186 bool isExact = false; 187 // See if we can convert this to an int64_t 188 uint64_t UIntVal; 189 if (APF.convertToInteger(MutableArrayRef(UIntVal), 64, true, 190 APFloat::rmTowardZero, &isExact) != APFloat::opOK || 191 !isExact) 192 return false; 193 IntVal = UIntVal; 194 return true; 195 } 196 197 /// If the loop has floating induction variable then insert corresponding 198 /// integer induction variable if possible. 199 /// For example, 200 /// for(double i = 0; i < 10000; ++i) 201 /// bar(i) 202 /// is converted into 203 /// for(int i = 0; i < 10000; ++i) 204 /// bar((double)i); 205 bool IndVarSimplify::handleFloatingPointIV(Loop *L, PHINode *PN) { 206 unsigned IncomingEdge = L->contains(PN->getIncomingBlock(0)); 207 unsigned BackEdge = IncomingEdge^1; 208 209 // Check incoming value. 210 auto *InitValueVal = dyn_cast<ConstantFP>(PN->getIncomingValue(IncomingEdge)); 211 212 int64_t InitValue; 213 if (!InitValueVal || !ConvertToSInt(InitValueVal->getValueAPF(), InitValue)) 214 return false; 215 216 // Check IV increment. Reject this PN if increment operation is not 217 // an add or increment value can not be represented by an integer. 218 auto *Incr = dyn_cast<BinaryOperator>(PN->getIncomingValue(BackEdge)); 219 if (Incr == nullptr || Incr->getOpcode() != Instruction::FAdd) return false; 220 221 // If this is not an add of the PHI with a constantfp, or if the constant fp 222 // is not an integer, bail out. 223 ConstantFP *IncValueVal = dyn_cast<ConstantFP>(Incr->getOperand(1)); 224 int64_t IncValue; 225 if (IncValueVal == nullptr || Incr->getOperand(0) != PN || 226 !ConvertToSInt(IncValueVal->getValueAPF(), IncValue)) 227 return false; 228 229 // Check Incr uses. One user is PN and the other user is an exit condition 230 // used by the conditional terminator. 231 Value::user_iterator IncrUse = Incr->user_begin(); 232 Instruction *U1 = cast<Instruction>(*IncrUse++); 233 if (IncrUse == Incr->user_end()) return false; 234 Instruction *U2 = cast<Instruction>(*IncrUse++); 235 if (IncrUse != Incr->user_end()) return false; 236 237 // Find exit condition, which is an fcmp. If it doesn't exist, or if it isn't 238 // only used by a branch, we can't transform it. 239 FCmpInst *Compare = dyn_cast<FCmpInst>(U1); 240 if (!Compare) 241 Compare = dyn_cast<FCmpInst>(U2); 242 if (!Compare || !Compare->hasOneUse() || 243 !isa<BranchInst>(Compare->user_back())) 244 return false; 245 246 BranchInst *TheBr = cast<BranchInst>(Compare->user_back()); 247 248 // We need to verify that the branch actually controls the iteration count 249 // of the loop. If not, the new IV can overflow and no one will notice. 250 // The branch block must be in the loop and one of the successors must be out 251 // of the loop. 252 assert(TheBr->isConditional() && "Can't use fcmp if not conditional"); 253 if (!L->contains(TheBr->getParent()) || 254 (L->contains(TheBr->getSuccessor(0)) && 255 L->contains(TheBr->getSuccessor(1)))) 256 return false; 257 258 // If it isn't a comparison with an integer-as-fp (the exit value), we can't 259 // transform it. 260 ConstantFP *ExitValueVal = dyn_cast<ConstantFP>(Compare->getOperand(1)); 261 int64_t ExitValue; 262 if (ExitValueVal == nullptr || 263 !ConvertToSInt(ExitValueVal->getValueAPF(), ExitValue)) 264 return false; 265 266 // Find new predicate for integer comparison. 267 CmpInst::Predicate NewPred = CmpInst::BAD_ICMP_PREDICATE; 268 switch (Compare->getPredicate()) { 269 default: return false; // Unknown comparison. 270 case CmpInst::FCMP_OEQ: 271 case CmpInst::FCMP_UEQ: NewPred = CmpInst::ICMP_EQ; break; 272 case CmpInst::FCMP_ONE: 273 case CmpInst::FCMP_UNE: NewPred = CmpInst::ICMP_NE; break; 274 case CmpInst::FCMP_OGT: 275 case CmpInst::FCMP_UGT: NewPred = CmpInst::ICMP_SGT; break; 276 case CmpInst::FCMP_OGE: 277 case CmpInst::FCMP_UGE: NewPred = CmpInst::ICMP_SGE; break; 278 case CmpInst::FCMP_OLT: 279 case CmpInst::FCMP_ULT: NewPred = CmpInst::ICMP_SLT; break; 280 case CmpInst::FCMP_OLE: 281 case CmpInst::FCMP_ULE: NewPred = CmpInst::ICMP_SLE; break; 282 } 283 284 // We convert the floating point induction variable to a signed i32 value if 285 // we can. This is only safe if the comparison will not overflow in a way 286 // that won't be trapped by the integer equivalent operations. Check for this 287 // now. 288 // TODO: We could use i64 if it is native and the range requires it. 289 290 // The start/stride/exit values must all fit in signed i32. 291 if (!isInt<32>(InitValue) || !isInt<32>(IncValue) || !isInt<32>(ExitValue)) 292 return false; 293 294 // If not actually striding (add x, 0.0), avoid touching the code. 295 if (IncValue == 0) 296 return false; 297 298 // Positive and negative strides have different safety conditions. 299 if (IncValue > 0) { 300 // If we have a positive stride, we require the init to be less than the 301 // exit value. 302 if (InitValue >= ExitValue) 303 return false; 304 305 uint32_t Range = uint32_t(ExitValue-InitValue); 306 // Check for infinite loop, either: 307 // while (i <= Exit) or until (i > Exit) 308 if (NewPred == CmpInst::ICMP_SLE || NewPred == CmpInst::ICMP_SGT) { 309 if (++Range == 0) return false; // Range overflows. 310 } 311 312 unsigned Leftover = Range % uint32_t(IncValue); 313 314 // If this is an equality comparison, we require that the strided value 315 // exactly land on the exit value, otherwise the IV condition will wrap 316 // around and do things the fp IV wouldn't. 317 if ((NewPred == CmpInst::ICMP_EQ || NewPred == CmpInst::ICMP_NE) && 318 Leftover != 0) 319 return false; 320 321 // If the stride would wrap around the i32 before exiting, we can't 322 // transform the IV. 323 if (Leftover != 0 && int32_t(ExitValue+IncValue) < ExitValue) 324 return false; 325 } else { 326 // If we have a negative stride, we require the init to be greater than the 327 // exit value. 328 if (InitValue <= ExitValue) 329 return false; 330 331 uint32_t Range = uint32_t(InitValue-ExitValue); 332 // Check for infinite loop, either: 333 // while (i >= Exit) or until (i < Exit) 334 if (NewPred == CmpInst::ICMP_SGE || NewPred == CmpInst::ICMP_SLT) { 335 if (++Range == 0) return false; // Range overflows. 336 } 337 338 unsigned Leftover = Range % uint32_t(-IncValue); 339 340 // If this is an equality comparison, we require that the strided value 341 // exactly land on the exit value, otherwise the IV condition will wrap 342 // around and do things the fp IV wouldn't. 343 if ((NewPred == CmpInst::ICMP_EQ || NewPred == CmpInst::ICMP_NE) && 344 Leftover != 0) 345 return false; 346 347 // If the stride would wrap around the i32 before exiting, we can't 348 // transform the IV. 349 if (Leftover != 0 && int32_t(ExitValue+IncValue) > ExitValue) 350 return false; 351 } 352 353 IntegerType *Int32Ty = Type::getInt32Ty(PN->getContext()); 354 355 // Insert new integer induction variable. 356 PHINode *NewPHI = 357 PHINode::Create(Int32Ty, 2, PN->getName() + ".int", PN->getIterator()); 358 NewPHI->addIncoming(ConstantInt::getSigned(Int32Ty, InitValue), 359 PN->getIncomingBlock(IncomingEdge)); 360 NewPHI->setDebugLoc(PN->getDebugLoc()); 361 362 Instruction *NewAdd = BinaryOperator::CreateAdd( 363 NewPHI, ConstantInt::getSigned(Int32Ty, IncValue), 364 Incr->getName() + ".int", Incr->getIterator()); 365 NewAdd->setDebugLoc(Incr->getDebugLoc()); 366 NewPHI->addIncoming(NewAdd, PN->getIncomingBlock(BackEdge)); 367 368 ICmpInst *NewCompare = new ICmpInst( 369 TheBr->getIterator(), NewPred, NewAdd, 370 ConstantInt::getSigned(Int32Ty, ExitValue), Compare->getName()); 371 NewCompare->setDebugLoc(Compare->getDebugLoc()); 372 373 // In the following deletions, PN may become dead and may be deleted. 374 // Use a WeakTrackingVH to observe whether this happens. 375 WeakTrackingVH WeakPH = PN; 376 377 // Delete the old floating point exit comparison. The branch starts using the 378 // new comparison. 379 NewCompare->takeName(Compare); 380 Compare->replaceAllUsesWith(NewCompare); 381 RecursivelyDeleteTriviallyDeadInstructions(Compare, TLI, MSSAU.get()); 382 383 // Delete the old floating point increment. 384 Incr->replaceAllUsesWith(PoisonValue::get(Incr->getType())); 385 RecursivelyDeleteTriviallyDeadInstructions(Incr, TLI, MSSAU.get()); 386 387 // If the FP induction variable still has uses, this is because something else 388 // in the loop uses its value. In order to canonicalize the induction 389 // variable, we chose to eliminate the IV and rewrite it in terms of an 390 // int->fp cast. 391 // 392 // We give preference to sitofp over uitofp because it is faster on most 393 // platforms. 394 if (WeakPH) { 395 Instruction *Conv = new SIToFPInst(NewPHI, PN->getType(), "indvar.conv", 396 PN->getParent()->getFirstInsertionPt()); 397 Conv->setDebugLoc(PN->getDebugLoc()); 398 PN->replaceAllUsesWith(Conv); 399 RecursivelyDeleteTriviallyDeadInstructions(PN, TLI, MSSAU.get()); 400 } 401 return true; 402 } 403 404 bool IndVarSimplify::rewriteNonIntegerIVs(Loop *L) { 405 // First step. Check to see if there are any floating-point recurrences. 406 // If there are, change them into integer recurrences, permitting analysis by 407 // the SCEV routines. 408 BasicBlock *Header = L->getHeader(); 409 410 SmallVector<WeakTrackingVH, 8> PHIs(llvm::make_pointer_range(Header->phis())); 411 412 bool Changed = false; 413 for (WeakTrackingVH &PHI : PHIs) 414 if (PHINode *PN = dyn_cast_or_null<PHINode>(&*PHI)) 415 Changed |= handleFloatingPointIV(L, PN); 416 417 // If the loop previously had floating-point IV, ScalarEvolution 418 // may not have been able to compute a trip count. Now that we've done some 419 // re-writing, the trip count may be computable. 420 if (Changed) 421 SE->forgetLoop(L); 422 return Changed; 423 } 424 425 //===---------------------------------------------------------------------===// 426 // rewriteFirstIterationLoopExitValues: Rewrite loop exit values if we know 427 // they will exit at the first iteration. 428 //===---------------------------------------------------------------------===// 429 430 /// Check to see if this loop has loop invariant conditions which lead to loop 431 /// exits. If so, we know that if the exit path is taken, it is at the first 432 /// loop iteration. This lets us predict exit values of PHI nodes that live in 433 /// loop header. 434 bool IndVarSimplify::rewriteFirstIterationLoopExitValues(Loop *L) { 435 // Verify the input to the pass is already in LCSSA form. 436 assert(L->isLCSSAForm(*DT)); 437 438 SmallVector<BasicBlock *, 8> ExitBlocks; 439 L->getUniqueExitBlocks(ExitBlocks); 440 441 bool MadeAnyChanges = false; 442 for (auto *ExitBB : ExitBlocks) { 443 // If there are no more PHI nodes in this exit block, then no more 444 // values defined inside the loop are used on this path. 445 for (PHINode &PN : ExitBB->phis()) { 446 for (unsigned IncomingValIdx = 0, E = PN.getNumIncomingValues(); 447 IncomingValIdx != E; ++IncomingValIdx) { 448 auto *IncomingBB = PN.getIncomingBlock(IncomingValIdx); 449 450 // Can we prove that the exit must run on the first iteration if it 451 // runs at all? (i.e. early exits are fine for our purposes, but 452 // traces which lead to this exit being taken on the 2nd iteration 453 // aren't.) Note that this is about whether the exit branch is 454 // executed, not about whether it is taken. 455 if (!L->getLoopLatch() || 456 !DT->dominates(IncomingBB, L->getLoopLatch())) 457 continue; 458 459 // Get condition that leads to the exit path. 460 auto *TermInst = IncomingBB->getTerminator(); 461 462 Value *Cond = nullptr; 463 if (auto *BI = dyn_cast<BranchInst>(TermInst)) { 464 // Must be a conditional branch, otherwise the block 465 // should not be in the loop. 466 Cond = BI->getCondition(); 467 } else if (auto *SI = dyn_cast<SwitchInst>(TermInst)) 468 Cond = SI->getCondition(); 469 else 470 continue; 471 472 if (!L->isLoopInvariant(Cond)) 473 continue; 474 475 auto *ExitVal = dyn_cast<PHINode>(PN.getIncomingValue(IncomingValIdx)); 476 477 // Only deal with PHIs in the loop header. 478 if (!ExitVal || ExitVal->getParent() != L->getHeader()) 479 continue; 480 481 // If ExitVal is a PHI on the loop header, then we know its 482 // value along this exit because the exit can only be taken 483 // on the first iteration. 484 auto *LoopPreheader = L->getLoopPreheader(); 485 assert(LoopPreheader && "Invalid loop"); 486 int PreheaderIdx = ExitVal->getBasicBlockIndex(LoopPreheader); 487 if (PreheaderIdx != -1) { 488 assert(ExitVal->getParent() == L->getHeader() && 489 "ExitVal must be in loop header"); 490 MadeAnyChanges = true; 491 PN.setIncomingValue(IncomingValIdx, 492 ExitVal->getIncomingValue(PreheaderIdx)); 493 SE->forgetValue(&PN); 494 } 495 } 496 } 497 } 498 return MadeAnyChanges; 499 } 500 501 //===----------------------------------------------------------------------===// 502 // IV Widening - Extend the width of an IV to cover its widest uses. 503 //===----------------------------------------------------------------------===// 504 505 /// Update information about the induction variable that is extended by this 506 /// sign or zero extend operation. This is used to determine the final width of 507 /// the IV before actually widening it. 508 static void visitIVCast(CastInst *Cast, WideIVInfo &WI, 509 ScalarEvolution *SE, 510 const TargetTransformInfo *TTI) { 511 bool IsSigned = Cast->getOpcode() == Instruction::SExt; 512 if (!IsSigned && Cast->getOpcode() != Instruction::ZExt) 513 return; 514 515 Type *Ty = Cast->getType(); 516 uint64_t Width = SE->getTypeSizeInBits(Ty); 517 if (!Cast->getDataLayout().isLegalInteger(Width)) 518 return; 519 520 // Check that `Cast` actually extends the induction variable (we rely on this 521 // later). This takes care of cases where `Cast` is extending a truncation of 522 // the narrow induction variable, and thus can end up being narrower than the 523 // "narrow" induction variable. 524 uint64_t NarrowIVWidth = SE->getTypeSizeInBits(WI.NarrowIV->getType()); 525 if (NarrowIVWidth >= Width) 526 return; 527 528 // Cast is either an sext or zext up to this point. 529 // We should not widen an indvar if arithmetics on the wider indvar are more 530 // expensive than those on the narrower indvar. We check only the cost of ADD 531 // because at least an ADD is required to increment the induction variable. We 532 // could compute more comprehensively the cost of all instructions on the 533 // induction variable when necessary. 534 if (TTI && 535 TTI->getArithmeticInstrCost(Instruction::Add, Ty) > 536 TTI->getArithmeticInstrCost(Instruction::Add, 537 Cast->getOperand(0)->getType())) { 538 return; 539 } 540 541 if (!WI.WidestNativeType || 542 Width > SE->getTypeSizeInBits(WI.WidestNativeType)) { 543 WI.WidestNativeType = SE->getEffectiveSCEVType(Ty); 544 WI.IsSigned = IsSigned; 545 return; 546 } 547 548 // We extend the IV to satisfy the sign of its user(s), or 'signed' 549 // if there are multiple users with both sign- and zero extensions, 550 // in order not to introduce nondeterministic behaviour based on the 551 // unspecified order of a PHI nodes' users-iterator. 552 WI.IsSigned |= IsSigned; 553 } 554 555 //===----------------------------------------------------------------------===// 556 // Live IV Reduction - Minimize IVs live across the loop. 557 //===----------------------------------------------------------------------===// 558 559 //===----------------------------------------------------------------------===// 560 // Simplification of IV users based on SCEV evaluation. 561 //===----------------------------------------------------------------------===// 562 563 namespace { 564 565 class IndVarSimplifyVisitor : public IVVisitor { 566 ScalarEvolution *SE; 567 const TargetTransformInfo *TTI; 568 PHINode *IVPhi; 569 570 public: 571 WideIVInfo WI; 572 573 IndVarSimplifyVisitor(PHINode *IV, ScalarEvolution *SCEV, 574 const TargetTransformInfo *TTI, 575 const DominatorTree *DTree) 576 : SE(SCEV), TTI(TTI), IVPhi(IV) { 577 DT = DTree; 578 WI.NarrowIV = IVPhi; 579 } 580 581 // Implement the interface used by simplifyUsersOfIV. 582 void visitCast(CastInst *Cast) override { visitIVCast(Cast, WI, SE, TTI); } 583 }; 584 585 } // end anonymous namespace 586 587 /// Iteratively perform simplification on a worklist of IV users. Each 588 /// successive simplification may push more users which may themselves be 589 /// candidates for simplification. 590 /// 591 /// Sign/Zero extend elimination is interleaved with IV simplification. 592 bool IndVarSimplify::simplifyAndExtend(Loop *L, 593 SCEVExpander &Rewriter, 594 LoopInfo *LI) { 595 SmallVector<WideIVInfo, 8> WideIVs; 596 597 auto *GuardDecl = Intrinsic::getDeclarationIfExists( 598 L->getBlocks()[0]->getModule(), Intrinsic::experimental_guard); 599 bool HasGuards = GuardDecl && !GuardDecl->use_empty(); 600 601 SmallVector<PHINode *, 8> LoopPhis( 602 llvm::make_pointer_range(L->getHeader()->phis())); 603 604 // Each round of simplification iterates through the SimplifyIVUsers worklist 605 // for all current phis, then determines whether any IVs can be 606 // widened. Widening adds new phis to LoopPhis, inducing another round of 607 // simplification on the wide IVs. 608 bool Changed = false; 609 while (!LoopPhis.empty()) { 610 // Evaluate as many IV expressions as possible before widening any IVs. This 611 // forces SCEV to set no-wrap flags before evaluating sign/zero 612 // extension. The first time SCEV attempts to normalize sign/zero extension, 613 // the result becomes final. So for the most predictable results, we delay 614 // evaluation of sign/zero extend evaluation until needed, and avoid running 615 // other SCEV based analysis prior to simplifyAndExtend. 616 do { 617 PHINode *CurrIV = LoopPhis.pop_back_val(); 618 619 // Information about sign/zero extensions of CurrIV. 620 IndVarSimplifyVisitor Visitor(CurrIV, SE, TTI, DT); 621 622 const auto &[C, U] = simplifyUsersOfIV(CurrIV, SE, DT, LI, TTI, DeadInsts, 623 Rewriter, &Visitor); 624 625 Changed |= C; 626 RunUnswitching |= U; 627 if (Visitor.WI.WidestNativeType) { 628 WideIVs.push_back(Visitor.WI); 629 } 630 } while(!LoopPhis.empty()); 631 632 // Continue if we disallowed widening. 633 if (!WidenIndVars) 634 continue; 635 636 for (; !WideIVs.empty(); WideIVs.pop_back()) { 637 unsigned ElimExt; 638 unsigned Widened; 639 if (PHINode *WidePhi = createWideIV(WideIVs.back(), LI, SE, Rewriter, 640 DT, DeadInsts, ElimExt, Widened, 641 HasGuards, UsePostIncrementRanges)) { 642 NumElimExt += ElimExt; 643 NumWidened += Widened; 644 Changed = true; 645 LoopPhis.push_back(WidePhi); 646 } 647 } 648 } 649 return Changed; 650 } 651 652 //===----------------------------------------------------------------------===// 653 // linearFunctionTestReplace and its kin. Rewrite the loop exit condition. 654 //===----------------------------------------------------------------------===// 655 656 /// Given an Value which is hoped to be part of an add recurance in the given 657 /// loop, return the associated Phi node if so. Otherwise, return null. Note 658 /// that this is less general than SCEVs AddRec checking. 659 static PHINode *getLoopPhiForCounter(Value *IncV, Loop *L) { 660 Instruction *IncI = dyn_cast<Instruction>(IncV); 661 if (!IncI) 662 return nullptr; 663 664 switch (IncI->getOpcode()) { 665 case Instruction::Add: 666 case Instruction::Sub: 667 break; 668 case Instruction::GetElementPtr: 669 // An IV counter must preserve its type. 670 if (IncI->getNumOperands() == 2) 671 break; 672 [[fallthrough]]; 673 default: 674 return nullptr; 675 } 676 677 PHINode *Phi = dyn_cast<PHINode>(IncI->getOperand(0)); 678 if (Phi && Phi->getParent() == L->getHeader()) { 679 if (L->isLoopInvariant(IncI->getOperand(1))) 680 return Phi; 681 return nullptr; 682 } 683 if (IncI->getOpcode() == Instruction::GetElementPtr) 684 return nullptr; 685 686 // Allow add/sub to be commuted. 687 Phi = dyn_cast<PHINode>(IncI->getOperand(1)); 688 if (Phi && Phi->getParent() == L->getHeader()) { 689 if (L->isLoopInvariant(IncI->getOperand(0))) 690 return Phi; 691 } 692 return nullptr; 693 } 694 695 /// Whether the current loop exit test is based on this value. Currently this 696 /// is limited to a direct use in the loop condition. 697 static bool isLoopExitTestBasedOn(Value *V, BasicBlock *ExitingBB) { 698 BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator()); 699 ICmpInst *ICmp = dyn_cast<ICmpInst>(BI->getCondition()); 700 // TODO: Allow non-icmp loop test. 701 if (!ICmp) 702 return false; 703 704 // TODO: Allow indirect use. 705 return ICmp->getOperand(0) == V || ICmp->getOperand(1) == V; 706 } 707 708 /// linearFunctionTestReplace policy. Return true unless we can show that the 709 /// current exit test is already sufficiently canonical. 710 static bool needsLFTR(Loop *L, BasicBlock *ExitingBB) { 711 assert(L->getLoopLatch() && "Must be in simplified form"); 712 713 // Avoid converting a constant or loop invariant test back to a runtime 714 // test. This is critical for when SCEV's cached ExitCount is less precise 715 // than the current IR (such as after we've proven a particular exit is 716 // actually dead and thus the BE count never reaches our ExitCount.) 717 BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator()); 718 if (L->isLoopInvariant(BI->getCondition())) 719 return false; 720 721 // Do LFTR to simplify the exit condition to an ICMP. 722 ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition()); 723 if (!Cond) 724 return true; 725 726 // Do LFTR to simplify the exit ICMP to EQ/NE 727 ICmpInst::Predicate Pred = Cond->getPredicate(); 728 if (Pred != ICmpInst::ICMP_NE && Pred != ICmpInst::ICMP_EQ) 729 return true; 730 731 // Look for a loop invariant RHS 732 Value *LHS = Cond->getOperand(0); 733 Value *RHS = Cond->getOperand(1); 734 if (!L->isLoopInvariant(RHS)) { 735 if (!L->isLoopInvariant(LHS)) 736 return true; 737 std::swap(LHS, RHS); 738 } 739 // Look for a simple IV counter LHS 740 PHINode *Phi = dyn_cast<PHINode>(LHS); 741 if (!Phi) 742 Phi = getLoopPhiForCounter(LHS, L); 743 744 if (!Phi) 745 return true; 746 747 // Do LFTR if PHI node is defined in the loop, but is *not* a counter. 748 int Idx = Phi->getBasicBlockIndex(L->getLoopLatch()); 749 if (Idx < 0) 750 return true; 751 752 // Do LFTR if the exit condition's IV is *not* a simple counter. 753 Value *IncV = Phi->getIncomingValue(Idx); 754 return Phi != getLoopPhiForCounter(IncV, L); 755 } 756 757 /// Recursive helper for hasConcreteDef(). Unfortunately, this currently boils 758 /// down to checking that all operands are constant and listing instructions 759 /// that may hide undef. 760 static bool hasConcreteDefImpl(Value *V, SmallPtrSetImpl<Value*> &Visited, 761 unsigned Depth) { 762 if (isa<Constant>(V)) 763 return !isa<UndefValue>(V); 764 765 if (Depth >= 6) 766 return false; 767 768 // Conservatively handle non-constant non-instructions. For example, Arguments 769 // may be undef. 770 Instruction *I = dyn_cast<Instruction>(V); 771 if (!I) 772 return false; 773 774 // Load and return values may be undef. 775 if(I->mayReadFromMemory() || isa<CallInst>(I) || isa<InvokeInst>(I)) 776 return false; 777 778 // Optimistically handle other instructions. 779 for (Value *Op : I->operands()) { 780 if (!Visited.insert(Op).second) 781 continue; 782 if (!hasConcreteDefImpl(Op, Visited, Depth+1)) 783 return false; 784 } 785 return true; 786 } 787 788 /// Return true if the given value is concrete. We must prove that undef can 789 /// never reach it. 790 /// 791 /// TODO: If we decide that this is a good approach to checking for undef, we 792 /// may factor it into a common location. 793 static bool hasConcreteDef(Value *V) { 794 SmallPtrSet<Value*, 8> Visited; 795 Visited.insert(V); 796 return hasConcreteDefImpl(V, Visited, 0); 797 } 798 799 /// Return true if the given phi is a "counter" in L. A counter is an 800 /// add recurance (of integer or pointer type) with an arbitrary start, and a 801 /// step of 1. Note that L must have exactly one latch. 802 static bool isLoopCounter(PHINode* Phi, Loop *L, 803 ScalarEvolution *SE) { 804 assert(Phi->getParent() == L->getHeader()); 805 assert(L->getLoopLatch()); 806 807 if (!SE->isSCEVable(Phi->getType())) 808 return false; 809 810 const SCEV *S = SE->getSCEV(Phi); 811 if (!match(S, m_scev_AffineAddRec(m_SCEV(), m_scev_One(), m_SpecificLoop(L)))) 812 return false; 813 814 int LatchIdx = Phi->getBasicBlockIndex(L->getLoopLatch()); 815 Value *IncV = Phi->getIncomingValue(LatchIdx); 816 return (getLoopPhiForCounter(IncV, L) == Phi && 817 isa<SCEVAddRecExpr>(SE->getSCEV(IncV))); 818 } 819 820 /// Search the loop header for a loop counter (anadd rec w/step of one) 821 /// suitable for use by LFTR. If multiple counters are available, select the 822 /// "best" one based profitable heuristics. 823 /// 824 /// BECount may be an i8* pointer type. The pointer difference is already 825 /// valid count without scaling the address stride, so it remains a pointer 826 /// expression as far as SCEV is concerned. 827 static PHINode *FindLoopCounter(Loop *L, BasicBlock *ExitingBB, 828 const SCEV *BECount, 829 ScalarEvolution *SE, DominatorTree *DT) { 830 uint64_t BCWidth = SE->getTypeSizeInBits(BECount->getType()); 831 832 Value *Cond = cast<BranchInst>(ExitingBB->getTerminator())->getCondition(); 833 834 // Loop over all of the PHI nodes, looking for a simple counter. 835 PHINode *BestPhi = nullptr; 836 const SCEV *BestInit = nullptr; 837 BasicBlock *LatchBlock = L->getLoopLatch(); 838 assert(LatchBlock && "Must be in simplified form"); 839 const DataLayout &DL = L->getHeader()->getDataLayout(); 840 841 for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) { 842 PHINode *Phi = cast<PHINode>(I); 843 if (!isLoopCounter(Phi, L, SE)) 844 continue; 845 846 const auto *AR = cast<SCEVAddRecExpr>(SE->getSCEV(Phi)); 847 848 // AR may be a pointer type, while BECount is an integer type. 849 // AR may be wider than BECount. With eq/ne tests overflow is immaterial. 850 // AR may not be a narrower type, or we may never exit. 851 uint64_t PhiWidth = SE->getTypeSizeInBits(AR->getType()); 852 if (PhiWidth < BCWidth || !DL.isLegalInteger(PhiWidth)) 853 continue; 854 855 // Avoid reusing a potentially undef value to compute other values that may 856 // have originally had a concrete definition. 857 if (!hasConcreteDef(Phi)) { 858 // We explicitly allow unknown phis as long as they are already used by 859 // the loop exit test. This is legal since performing LFTR could not 860 // increase the number of undef users. 861 Value *IncPhi = Phi->getIncomingValueForBlock(LatchBlock); 862 if (!isLoopExitTestBasedOn(Phi, ExitingBB) && 863 !isLoopExitTestBasedOn(IncPhi, ExitingBB)) 864 continue; 865 } 866 867 // Avoid introducing undefined behavior due to poison which didn't exist in 868 // the original program. (Annoyingly, the rules for poison and undef 869 // propagation are distinct, so this does NOT cover the undef case above.) 870 // We have to ensure that we don't introduce UB by introducing a use on an 871 // iteration where said IV produces poison. Our strategy here differs for 872 // pointers and integer IVs. For integers, we strip and reinfer as needed, 873 // see code in linearFunctionTestReplace. For pointers, we restrict 874 // transforms as there is no good way to reinfer inbounds once lost. 875 if (!Phi->getType()->isIntegerTy() && 876 !mustExecuteUBIfPoisonOnPathTo(Phi, ExitingBB->getTerminator(), DT)) 877 continue; 878 879 const SCEV *Init = AR->getStart(); 880 881 if (BestPhi && !isAlmostDeadIV(BestPhi, LatchBlock, Cond)) { 882 // Don't force a live loop counter if another IV can be used. 883 if (isAlmostDeadIV(Phi, LatchBlock, Cond)) 884 continue; 885 886 // Prefer to count-from-zero. This is a more "canonical" counter form. It 887 // also prefers integer to pointer IVs. 888 if (BestInit->isZero() != Init->isZero()) { 889 if (BestInit->isZero()) 890 continue; 891 } 892 // If two IVs both count from zero or both count from nonzero then the 893 // narrower is likely a dead phi that has been widened. Use the wider phi 894 // to allow the other to be eliminated. 895 else if (PhiWidth <= SE->getTypeSizeInBits(BestPhi->getType())) 896 continue; 897 } 898 BestPhi = Phi; 899 BestInit = Init; 900 } 901 return BestPhi; 902 } 903 904 /// Insert an IR expression which computes the value held by the IV IndVar 905 /// (which must be an loop counter w/unit stride) after the backedge of loop L 906 /// is taken ExitCount times. 907 static Value *genLoopLimit(PHINode *IndVar, BasicBlock *ExitingBB, 908 const SCEV *ExitCount, bool UsePostInc, Loop *L, 909 SCEVExpander &Rewriter, ScalarEvolution *SE) { 910 assert(isLoopCounter(IndVar, L, SE)); 911 assert(ExitCount->getType()->isIntegerTy() && "exit count must be integer"); 912 const SCEVAddRecExpr *AR = cast<SCEVAddRecExpr>(SE->getSCEV(IndVar)); 913 assert(AR->getStepRecurrence(*SE)->isOne() && "only handles unit stride"); 914 915 // For integer IVs, truncate the IV before computing the limit unless we 916 // know apriori that the limit must be a constant when evaluated in the 917 // bitwidth of the IV. We prefer (potentially) keeping a truncate of the 918 // IV in the loop over a (potentially) expensive expansion of the widened 919 // exit count add(zext(add)) expression. 920 if (IndVar->getType()->isIntegerTy() && 921 SE->getTypeSizeInBits(AR->getType()) > 922 SE->getTypeSizeInBits(ExitCount->getType())) { 923 const SCEV *IVInit = AR->getStart(); 924 if (!isa<SCEVConstant>(IVInit) || !isa<SCEVConstant>(ExitCount)) 925 AR = cast<SCEVAddRecExpr>(SE->getTruncateExpr(AR, ExitCount->getType())); 926 } 927 928 const SCEVAddRecExpr *ARBase = UsePostInc ? AR->getPostIncExpr(*SE) : AR; 929 const SCEV *IVLimit = ARBase->evaluateAtIteration(ExitCount, *SE); 930 assert(SE->isLoopInvariant(IVLimit, L) && 931 "Computed iteration count is not loop invariant!"); 932 return Rewriter.expandCodeFor(IVLimit, ARBase->getType(), 933 ExitingBB->getTerminator()); 934 } 935 936 /// This method rewrites the exit condition of the loop to be a canonical != 937 /// comparison against the incremented loop induction variable. This pass is 938 /// able to rewrite the exit tests of any loop where the SCEV analysis can 939 /// determine a loop-invariant trip count of the loop, which is actually a much 940 /// broader range than just linear tests. 941 bool IndVarSimplify:: 942 linearFunctionTestReplace(Loop *L, BasicBlock *ExitingBB, 943 const SCEV *ExitCount, 944 PHINode *IndVar, SCEVExpander &Rewriter) { 945 assert(L->getLoopLatch() && "Loop no longer in simplified form?"); 946 assert(isLoopCounter(IndVar, L, SE)); 947 Instruction * const IncVar = 948 cast<Instruction>(IndVar->getIncomingValueForBlock(L->getLoopLatch())); 949 950 // Initialize CmpIndVar to the preincremented IV. 951 Value *CmpIndVar = IndVar; 952 bool UsePostInc = false; 953 954 // If the exiting block is the same as the backedge block, we prefer to 955 // compare against the post-incremented value, otherwise we must compare 956 // against the preincremented value. 957 if (ExitingBB == L->getLoopLatch()) { 958 // For pointer IVs, we chose to not strip inbounds which requires us not 959 // to add a potentially UB introducing use. We need to either a) show 960 // the loop test we're modifying is already in post-inc form, or b) show 961 // that adding a use must not introduce UB. 962 bool SafeToPostInc = 963 IndVar->getType()->isIntegerTy() || 964 isLoopExitTestBasedOn(IncVar, ExitingBB) || 965 mustExecuteUBIfPoisonOnPathTo(IncVar, ExitingBB->getTerminator(), DT); 966 if (SafeToPostInc) { 967 UsePostInc = true; 968 CmpIndVar = IncVar; 969 } 970 } 971 972 // It may be necessary to drop nowrap flags on the incrementing instruction 973 // if either LFTR moves from a pre-inc check to a post-inc check (in which 974 // case the increment might have previously been poison on the last iteration 975 // only) or if LFTR switches to a different IV that was previously dynamically 976 // dead (and as such may be arbitrarily poison). We remove any nowrap flags 977 // that SCEV didn't infer for the post-inc addrec (even if we use a pre-inc 978 // check), because the pre-inc addrec flags may be adopted from the original 979 // instruction, while SCEV has to explicitly prove the post-inc nowrap flags. 980 // TODO: This handling is inaccurate for one case: If we switch to a 981 // dynamically dead IV that wraps on the first loop iteration only, which is 982 // not covered by the post-inc addrec. (If the new IV was not dynamically 983 // dead, it could not be poison on the first iteration in the first place.) 984 if (auto *BO = dyn_cast<BinaryOperator>(IncVar)) { 985 const SCEVAddRecExpr *AR = cast<SCEVAddRecExpr>(SE->getSCEV(IncVar)); 986 if (BO->hasNoUnsignedWrap()) 987 BO->setHasNoUnsignedWrap(AR->hasNoUnsignedWrap()); 988 if (BO->hasNoSignedWrap()) 989 BO->setHasNoSignedWrap(AR->hasNoSignedWrap()); 990 } 991 992 Value *ExitCnt = genLoopLimit( 993 IndVar, ExitingBB, ExitCount, UsePostInc, L, Rewriter, SE); 994 assert(ExitCnt->getType()->isPointerTy() == 995 IndVar->getType()->isPointerTy() && 996 "genLoopLimit missed a cast"); 997 998 // Insert a new icmp_ne or icmp_eq instruction before the branch. 999 BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator()); 1000 ICmpInst::Predicate P; 1001 if (L->contains(BI->getSuccessor(0))) 1002 P = ICmpInst::ICMP_NE; 1003 else 1004 P = ICmpInst::ICMP_EQ; 1005 1006 IRBuilder<> Builder(BI); 1007 1008 // The new loop exit condition should reuse the debug location of the 1009 // original loop exit condition. 1010 if (auto *Cond = dyn_cast<Instruction>(BI->getCondition())) 1011 Builder.SetCurrentDebugLocation(Cond->getDebugLoc()); 1012 1013 // For integer IVs, if we evaluated the limit in the narrower bitwidth to 1014 // avoid the expensive expansion of the limit expression in the wider type, 1015 // emit a truncate to narrow the IV to the ExitCount type. This is safe 1016 // since we know (from the exit count bitwidth), that we can't self-wrap in 1017 // the narrower type. 1018 unsigned CmpIndVarSize = SE->getTypeSizeInBits(CmpIndVar->getType()); 1019 unsigned ExitCntSize = SE->getTypeSizeInBits(ExitCnt->getType()); 1020 if (CmpIndVarSize > ExitCntSize) { 1021 assert(!CmpIndVar->getType()->isPointerTy() && 1022 !ExitCnt->getType()->isPointerTy()); 1023 1024 // Before resorting to actually inserting the truncate, use the same 1025 // reasoning as from SimplifyIndvar::eliminateTrunc to see if we can extend 1026 // the other side of the comparison instead. We still evaluate the limit 1027 // in the narrower bitwidth, we just prefer a zext/sext outside the loop to 1028 // a truncate within in. 1029 bool Extended = false; 1030 const SCEV *IV = SE->getSCEV(CmpIndVar); 1031 const SCEV *TruncatedIV = SE->getTruncateExpr(IV, ExitCnt->getType()); 1032 const SCEV *ZExtTrunc = 1033 SE->getZeroExtendExpr(TruncatedIV, CmpIndVar->getType()); 1034 1035 if (ZExtTrunc == IV) { 1036 Extended = true; 1037 ExitCnt = Builder.CreateZExt(ExitCnt, IndVar->getType(), 1038 "wide.trip.count"); 1039 } else { 1040 const SCEV *SExtTrunc = 1041 SE->getSignExtendExpr(TruncatedIV, CmpIndVar->getType()); 1042 if (SExtTrunc == IV) { 1043 Extended = true; 1044 ExitCnt = Builder.CreateSExt(ExitCnt, IndVar->getType(), 1045 "wide.trip.count"); 1046 } 1047 } 1048 1049 if (Extended) { 1050 bool Discard; 1051 L->makeLoopInvariant(ExitCnt, Discard); 1052 } else 1053 CmpIndVar = Builder.CreateTrunc(CmpIndVar, ExitCnt->getType(), 1054 "lftr.wideiv"); 1055 } 1056 LLVM_DEBUG(dbgs() << "INDVARS: Rewriting loop exit condition to:\n" 1057 << " LHS:" << *CmpIndVar << '\n' 1058 << " op:\t" << (P == ICmpInst::ICMP_NE ? "!=" : "==") 1059 << "\n" 1060 << " RHS:\t" << *ExitCnt << "\n" 1061 << "ExitCount:\t" << *ExitCount << "\n" 1062 << " was: " << *BI->getCondition() << "\n"); 1063 1064 Value *Cond = Builder.CreateICmp(P, CmpIndVar, ExitCnt, "exitcond"); 1065 Value *OrigCond = BI->getCondition(); 1066 // It's tempting to use replaceAllUsesWith here to fully replace the old 1067 // comparison, but that's not immediately safe, since users of the old 1068 // comparison may not be dominated by the new comparison. Instead, just 1069 // update the branch to use the new comparison; in the common case this 1070 // will make old comparison dead. 1071 BI->setCondition(Cond); 1072 DeadInsts.emplace_back(OrigCond); 1073 1074 ++NumLFTR; 1075 return true; 1076 } 1077 1078 //===----------------------------------------------------------------------===// 1079 // sinkUnusedInvariants. A late subpass to cleanup loop preheaders. 1080 //===----------------------------------------------------------------------===// 1081 1082 /// If there's a single exit block, sink any loop-invariant values that 1083 /// were defined in the preheader but not used inside the loop into the 1084 /// exit block to reduce register pressure in the loop. 1085 bool IndVarSimplify::sinkUnusedInvariants(Loop *L) { 1086 BasicBlock *ExitBlock = L->getExitBlock(); 1087 if (!ExitBlock) return false; 1088 1089 BasicBlock *Preheader = L->getLoopPreheader(); 1090 if (!Preheader) return false; 1091 1092 bool MadeAnyChanges = false; 1093 for (Instruction &I : llvm::make_early_inc_range(llvm::reverse(*Preheader))) { 1094 1095 // Skip BB Terminator. 1096 if (Preheader->getTerminator() == &I) 1097 continue; 1098 1099 // New instructions were inserted at the end of the preheader. 1100 if (isa<PHINode>(I)) 1101 break; 1102 1103 // Don't move instructions which might have side effects, since the side 1104 // effects need to complete before instructions inside the loop. Also don't 1105 // move instructions which might read memory, since the loop may modify 1106 // memory. Note that it's okay if the instruction might have undefined 1107 // behavior: LoopSimplify guarantees that the preheader dominates the exit 1108 // block. 1109 if (I.mayHaveSideEffects() || I.mayReadFromMemory()) 1110 continue; 1111 1112 // Skip debug or pseudo instructions. 1113 if (I.isDebugOrPseudoInst()) 1114 continue; 1115 1116 // Skip eh pad instructions. 1117 if (I.isEHPad()) 1118 continue; 1119 1120 // Don't sink alloca: we never want to sink static alloca's out of the 1121 // entry block, and correctly sinking dynamic alloca's requires 1122 // checks for stacksave/stackrestore intrinsics. 1123 // FIXME: Refactor this check somehow? 1124 if (isa<AllocaInst>(&I)) 1125 continue; 1126 1127 // Determine if there is a use in or before the loop (direct or 1128 // otherwise). 1129 bool UsedInLoop = false; 1130 for (Use &U : I.uses()) { 1131 Instruction *User = cast<Instruction>(U.getUser()); 1132 BasicBlock *UseBB = User->getParent(); 1133 if (PHINode *P = dyn_cast<PHINode>(User)) { 1134 unsigned i = 1135 PHINode::getIncomingValueNumForOperand(U.getOperandNo()); 1136 UseBB = P->getIncomingBlock(i); 1137 } 1138 if (UseBB == Preheader || L->contains(UseBB)) { 1139 UsedInLoop = true; 1140 break; 1141 } 1142 } 1143 1144 // If there is, the def must remain in the preheader. 1145 if (UsedInLoop) 1146 continue; 1147 1148 // Otherwise, sink it to the exit block. 1149 I.moveBefore(ExitBlock->getFirstInsertionPt()); 1150 SE->forgetValue(&I); 1151 MadeAnyChanges = true; 1152 } 1153 1154 return MadeAnyChanges; 1155 } 1156 1157 static void replaceExitCond(BranchInst *BI, Value *NewCond, 1158 SmallVectorImpl<WeakTrackingVH> &DeadInsts) { 1159 auto *OldCond = BI->getCondition(); 1160 LLVM_DEBUG(dbgs() << "Replacing condition of loop-exiting branch " << *BI 1161 << " with " << *NewCond << "\n"); 1162 BI->setCondition(NewCond); 1163 if (OldCond->use_empty()) 1164 DeadInsts.emplace_back(OldCond); 1165 } 1166 1167 static Constant *createFoldedExitCond(const Loop *L, BasicBlock *ExitingBB, 1168 bool IsTaken) { 1169 BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator()); 1170 bool ExitIfTrue = !L->contains(*succ_begin(ExitingBB)); 1171 auto *OldCond = BI->getCondition(); 1172 return ConstantInt::get(OldCond->getType(), 1173 IsTaken ? ExitIfTrue : !ExitIfTrue); 1174 } 1175 1176 static void foldExit(const Loop *L, BasicBlock *ExitingBB, bool IsTaken, 1177 SmallVectorImpl<WeakTrackingVH> &DeadInsts) { 1178 BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator()); 1179 auto *NewCond = createFoldedExitCond(L, ExitingBB, IsTaken); 1180 replaceExitCond(BI, NewCond, DeadInsts); 1181 } 1182 1183 static void replaceLoopPHINodesWithPreheaderValues( 1184 LoopInfo *LI, Loop *L, SmallVectorImpl<WeakTrackingVH> &DeadInsts, 1185 ScalarEvolution &SE) { 1186 assert(L->isLoopSimplifyForm() && "Should only do it in simplify form!"); 1187 auto *LoopPreheader = L->getLoopPreheader(); 1188 auto *LoopHeader = L->getHeader(); 1189 SmallVector<Instruction *> Worklist; 1190 for (auto &PN : LoopHeader->phis()) { 1191 auto *PreheaderIncoming = PN.getIncomingValueForBlock(LoopPreheader); 1192 for (User *U : PN.users()) 1193 Worklist.push_back(cast<Instruction>(U)); 1194 SE.forgetValue(&PN); 1195 PN.replaceAllUsesWith(PreheaderIncoming); 1196 DeadInsts.emplace_back(&PN); 1197 } 1198 1199 // Replacing with the preheader value will often allow IV users to simplify 1200 // (especially if the preheader value is a constant). 1201 SmallPtrSet<Instruction *, 16> Visited; 1202 while (!Worklist.empty()) { 1203 auto *I = cast<Instruction>(Worklist.pop_back_val()); 1204 if (!Visited.insert(I).second) 1205 continue; 1206 1207 // Don't simplify instructions outside the loop. 1208 if (!L->contains(I)) 1209 continue; 1210 1211 Value *Res = simplifyInstruction(I, I->getDataLayout()); 1212 if (Res && LI->replacementPreservesLCSSAForm(I, Res)) { 1213 for (User *U : I->users()) 1214 Worklist.push_back(cast<Instruction>(U)); 1215 I->replaceAllUsesWith(Res); 1216 DeadInsts.emplace_back(I); 1217 } 1218 } 1219 } 1220 1221 static Value * 1222 createInvariantCond(const Loop *L, BasicBlock *ExitingBB, 1223 const ScalarEvolution::LoopInvariantPredicate &LIP, 1224 SCEVExpander &Rewriter) { 1225 ICmpInst::Predicate InvariantPred = LIP.Pred; 1226 BasicBlock *Preheader = L->getLoopPreheader(); 1227 assert(Preheader && "Preheader doesn't exist"); 1228 Rewriter.setInsertPoint(Preheader->getTerminator()); 1229 auto *LHSV = Rewriter.expandCodeFor(LIP.LHS); 1230 auto *RHSV = Rewriter.expandCodeFor(LIP.RHS); 1231 bool ExitIfTrue = !L->contains(*succ_begin(ExitingBB)); 1232 if (ExitIfTrue) 1233 InvariantPred = ICmpInst::getInversePredicate(InvariantPred); 1234 IRBuilder<> Builder(Preheader->getTerminator()); 1235 BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator()); 1236 return Builder.CreateICmp(InvariantPred, LHSV, RHSV, 1237 BI->getCondition()->getName()); 1238 } 1239 1240 static std::optional<Value *> 1241 createReplacement(ICmpInst *ICmp, const Loop *L, BasicBlock *ExitingBB, 1242 const SCEV *MaxIter, bool Inverted, bool SkipLastIter, 1243 ScalarEvolution *SE, SCEVExpander &Rewriter) { 1244 CmpPredicate Pred = ICmp->getCmpPredicate(); 1245 Value *LHS = ICmp->getOperand(0); 1246 Value *RHS = ICmp->getOperand(1); 1247 1248 // 'LHS pred RHS' should now mean that we stay in loop. 1249 auto *BI = cast<BranchInst>(ExitingBB->getTerminator()); 1250 if (Inverted) 1251 Pred = ICmpInst::getInverseCmpPredicate(Pred); 1252 1253 const SCEV *LHSS = SE->getSCEVAtScope(LHS, L); 1254 const SCEV *RHSS = SE->getSCEVAtScope(RHS, L); 1255 // Can we prove it to be trivially true or false? 1256 if (auto EV = SE->evaluatePredicateAt(Pred, LHSS, RHSS, BI)) 1257 return createFoldedExitCond(L, ExitingBB, /*IsTaken*/ !*EV); 1258 1259 auto *ARTy = LHSS->getType(); 1260 auto *MaxIterTy = MaxIter->getType(); 1261 // If possible, adjust types. 1262 if (SE->getTypeSizeInBits(ARTy) > SE->getTypeSizeInBits(MaxIterTy)) 1263 MaxIter = SE->getZeroExtendExpr(MaxIter, ARTy); 1264 else if (SE->getTypeSizeInBits(ARTy) < SE->getTypeSizeInBits(MaxIterTy)) { 1265 const SCEV *MinusOne = SE->getMinusOne(ARTy); 1266 const SCEV *MaxAllowedIter = SE->getZeroExtendExpr(MinusOne, MaxIterTy); 1267 if (SE->isKnownPredicateAt(ICmpInst::ICMP_ULE, MaxIter, MaxAllowedIter, BI)) 1268 MaxIter = SE->getTruncateExpr(MaxIter, ARTy); 1269 } 1270 1271 if (SkipLastIter) { 1272 // Semantically skip last iter is "subtract 1, do not bother about unsigned 1273 // wrap". getLoopInvariantExitCondDuringFirstIterations knows how to deal 1274 // with umin in a smart way, but umin(a, b) - 1 will likely not simplify. 1275 // So we manually construct umin(a - 1, b - 1). 1276 SmallVector<const SCEV *, 4> Elements; 1277 if (auto *UMin = dyn_cast<SCEVUMinExpr>(MaxIter)) { 1278 for (const SCEV *Op : UMin->operands()) 1279 Elements.push_back(SE->getMinusSCEV(Op, SE->getOne(Op->getType()))); 1280 MaxIter = SE->getUMinFromMismatchedTypes(Elements); 1281 } else 1282 MaxIter = SE->getMinusSCEV(MaxIter, SE->getOne(MaxIter->getType())); 1283 } 1284 1285 // Check if there is a loop-invariant predicate equivalent to our check. 1286 auto LIP = SE->getLoopInvariantExitCondDuringFirstIterations(Pred, LHSS, RHSS, 1287 L, BI, MaxIter); 1288 if (!LIP) 1289 return std::nullopt; 1290 1291 // Can we prove it to be trivially true? 1292 if (SE->isKnownPredicateAt(LIP->Pred, LIP->LHS, LIP->RHS, BI)) 1293 return createFoldedExitCond(L, ExitingBB, /*IsTaken*/ false); 1294 else 1295 return createInvariantCond(L, ExitingBB, *LIP, Rewriter); 1296 } 1297 1298 static bool optimizeLoopExitWithUnknownExitCount( 1299 const Loop *L, BranchInst *BI, BasicBlock *ExitingBB, const SCEV *MaxIter, 1300 bool SkipLastIter, ScalarEvolution *SE, SCEVExpander &Rewriter, 1301 SmallVectorImpl<WeakTrackingVH> &DeadInsts) { 1302 assert( 1303 (L->contains(BI->getSuccessor(0)) != L->contains(BI->getSuccessor(1))) && 1304 "Not a loop exit!"); 1305 1306 // For branch that stays in loop by TRUE condition, go through AND. For branch 1307 // that stays in loop by FALSE condition, go through OR. Both gives the 1308 // similar logic: "stay in loop iff all conditions are true(false)". 1309 bool Inverted = L->contains(BI->getSuccessor(1)); 1310 SmallVector<ICmpInst *, 4> LeafConditions; 1311 SmallVector<Value *, 4> Worklist; 1312 SmallPtrSet<Value *, 4> Visited; 1313 Value *OldCond = BI->getCondition(); 1314 Visited.insert(OldCond); 1315 Worklist.push_back(OldCond); 1316 1317 auto GoThrough = [&](Value *V) { 1318 Value *LHS = nullptr, *RHS = nullptr; 1319 if (Inverted) { 1320 if (!match(V, m_LogicalOr(m_Value(LHS), m_Value(RHS)))) 1321 return false; 1322 } else { 1323 if (!match(V, m_LogicalAnd(m_Value(LHS), m_Value(RHS)))) 1324 return false; 1325 } 1326 if (Visited.insert(LHS).second) 1327 Worklist.push_back(LHS); 1328 if (Visited.insert(RHS).second) 1329 Worklist.push_back(RHS); 1330 return true; 1331 }; 1332 1333 do { 1334 Value *Curr = Worklist.pop_back_val(); 1335 // Go through AND/OR conditions. Collect leaf ICMPs. We only care about 1336 // those with one use, to avoid instruction duplication. 1337 if (Curr->hasOneUse()) 1338 if (!GoThrough(Curr)) 1339 if (auto *ICmp = dyn_cast<ICmpInst>(Curr)) 1340 LeafConditions.push_back(ICmp); 1341 } while (!Worklist.empty()); 1342 1343 // If the current basic block has the same exit count as the whole loop, and 1344 // it consists of multiple icmp's, try to collect all icmp's that give exact 1345 // same exit count. For all other icmp's, we could use one less iteration, 1346 // because their value on the last iteration doesn't really matter. 1347 SmallPtrSet<ICmpInst *, 4> ICmpsFailingOnLastIter; 1348 if (!SkipLastIter && LeafConditions.size() > 1 && 1349 SE->getExitCount(L, ExitingBB, 1350 ScalarEvolution::ExitCountKind::SymbolicMaximum) == 1351 MaxIter) 1352 for (auto *ICmp : LeafConditions) { 1353 auto EL = SE->computeExitLimitFromCond(L, ICmp, Inverted, 1354 /*ControlsExit*/ false); 1355 const SCEV *ExitMax = EL.SymbolicMaxNotTaken; 1356 if (isa<SCEVCouldNotCompute>(ExitMax)) 1357 continue; 1358 // They could be of different types (specifically this happens after 1359 // IV widening). 1360 auto *WiderType = 1361 SE->getWiderType(ExitMax->getType(), MaxIter->getType()); 1362 const SCEV *WideExitMax = SE->getNoopOrZeroExtend(ExitMax, WiderType); 1363 const SCEV *WideMaxIter = SE->getNoopOrZeroExtend(MaxIter, WiderType); 1364 if (WideExitMax == WideMaxIter) 1365 ICmpsFailingOnLastIter.insert(ICmp); 1366 } 1367 1368 bool Changed = false; 1369 for (auto *OldCond : LeafConditions) { 1370 // Skip last iteration for this icmp under one of two conditions: 1371 // - We do it for all conditions; 1372 // - There is another ICmp that would fail on last iter, so this one doesn't 1373 // really matter. 1374 bool OptimisticSkipLastIter = SkipLastIter; 1375 if (!OptimisticSkipLastIter) { 1376 if (ICmpsFailingOnLastIter.size() > 1) 1377 OptimisticSkipLastIter = true; 1378 else if (ICmpsFailingOnLastIter.size() == 1) 1379 OptimisticSkipLastIter = !ICmpsFailingOnLastIter.count(OldCond); 1380 } 1381 if (auto Replaced = 1382 createReplacement(OldCond, L, ExitingBB, MaxIter, Inverted, 1383 OptimisticSkipLastIter, SE, Rewriter)) { 1384 Changed = true; 1385 auto *NewCond = *Replaced; 1386 if (auto *NCI = dyn_cast<Instruction>(NewCond)) { 1387 NCI->setName(OldCond->getName() + ".first_iter"); 1388 } 1389 LLVM_DEBUG(dbgs() << "Unknown exit count: Replacing " << *OldCond 1390 << " with " << *NewCond << "\n"); 1391 assert(OldCond->hasOneUse() && "Must be!"); 1392 OldCond->replaceAllUsesWith(NewCond); 1393 DeadInsts.push_back(OldCond); 1394 // Make sure we no longer consider this condition as failing on last 1395 // iteration. 1396 ICmpsFailingOnLastIter.erase(OldCond); 1397 } 1398 } 1399 return Changed; 1400 } 1401 1402 bool IndVarSimplify::canonicalizeExitCondition(Loop *L) { 1403 // Note: This is duplicating a particular part on SimplifyIndVars reasoning. 1404 // We need to duplicate it because given icmp zext(small-iv), C, IVUsers 1405 // never reaches the icmp since the zext doesn't fold to an AddRec unless 1406 // it already has flags. The alternative to this would be to extending the 1407 // set of "interesting" IV users to include the icmp, but doing that 1408 // regresses results in practice by querying SCEVs before trip counts which 1409 // rely on them which results in SCEV caching sub-optimal answers. The 1410 // concern about caching sub-optimal results is why we only query SCEVs of 1411 // the loop invariant RHS here. 1412 SmallVector<BasicBlock*, 16> ExitingBlocks; 1413 L->getExitingBlocks(ExitingBlocks); 1414 bool Changed = false; 1415 for (auto *ExitingBB : ExitingBlocks) { 1416 auto *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator()); 1417 if (!BI) 1418 continue; 1419 assert(BI->isConditional() && "exit branch must be conditional"); 1420 1421 auto *ICmp = dyn_cast<ICmpInst>(BI->getCondition()); 1422 if (!ICmp || !ICmp->hasOneUse()) 1423 continue; 1424 1425 auto *LHS = ICmp->getOperand(0); 1426 auto *RHS = ICmp->getOperand(1); 1427 // For the range reasoning, avoid computing SCEVs in the loop to avoid 1428 // poisoning cache with sub-optimal results. For the must-execute case, 1429 // this is a neccessary precondition for correctness. 1430 if (!L->isLoopInvariant(RHS)) { 1431 if (!L->isLoopInvariant(LHS)) 1432 continue; 1433 // Same logic applies for the inverse case 1434 std::swap(LHS, RHS); 1435 } 1436 1437 // Match (icmp signed-cond zext, RHS) 1438 Value *LHSOp = nullptr; 1439 if (!match(LHS, m_ZExt(m_Value(LHSOp))) || !ICmp->isSigned()) 1440 continue; 1441 1442 const unsigned InnerBitWidth = DL.getTypeSizeInBits(LHSOp->getType()); 1443 const unsigned OuterBitWidth = DL.getTypeSizeInBits(RHS->getType()); 1444 auto FullCR = ConstantRange::getFull(InnerBitWidth); 1445 FullCR = FullCR.zeroExtend(OuterBitWidth); 1446 auto RHSCR = SE->getUnsignedRange(SE->applyLoopGuards(SE->getSCEV(RHS), L)); 1447 if (FullCR.contains(RHSCR)) { 1448 // We have now matched icmp signed-cond zext(X), zext(Y'), and can thus 1449 // replace the signed condition with the unsigned version. 1450 ICmp->setPredicate(ICmp->getUnsignedPredicate()); 1451 Changed = true; 1452 // Note: No SCEV invalidation needed. We've changed the predicate, but 1453 // have not changed exit counts, or the values produced by the compare. 1454 continue; 1455 } 1456 } 1457 1458 // Now that we've canonicalized the condition to match the extend, 1459 // see if we can rotate the extend out of the loop. 1460 for (auto *ExitingBB : ExitingBlocks) { 1461 auto *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator()); 1462 if (!BI) 1463 continue; 1464 assert(BI->isConditional() && "exit branch must be conditional"); 1465 1466 auto *ICmp = dyn_cast<ICmpInst>(BI->getCondition()); 1467 if (!ICmp || !ICmp->hasOneUse() || !ICmp->isUnsigned()) 1468 continue; 1469 1470 bool Swapped = false; 1471 auto *LHS = ICmp->getOperand(0); 1472 auto *RHS = ICmp->getOperand(1); 1473 if (L->isLoopInvariant(LHS) == L->isLoopInvariant(RHS)) 1474 // Nothing to rotate 1475 continue; 1476 if (L->isLoopInvariant(LHS)) { 1477 // Same logic applies for the inverse case until we actually pick 1478 // which operand of the compare to update. 1479 Swapped = true; 1480 std::swap(LHS, RHS); 1481 } 1482 assert(!L->isLoopInvariant(LHS) && L->isLoopInvariant(RHS)); 1483 1484 // Match (icmp unsigned-cond zext, RHS) 1485 // TODO: Extend to handle corresponding sext/signed-cmp case 1486 // TODO: Extend to other invertible functions 1487 Value *LHSOp = nullptr; 1488 if (!match(LHS, m_ZExt(m_Value(LHSOp)))) 1489 continue; 1490 1491 // In general, we only rotate if we can do so without increasing the number 1492 // of instructions. The exception is when we have an zext(add-rec). The 1493 // reason for allowing this exception is that we know we need to get rid 1494 // of the zext for SCEV to be able to compute a trip count for said loops; 1495 // we consider the new trip count valuable enough to increase instruction 1496 // count by one. 1497 if (!LHS->hasOneUse() && !isa<SCEVAddRecExpr>(SE->getSCEV(LHSOp))) 1498 continue; 1499 1500 // Given a icmp unsigned-cond zext(Op) where zext(trunc(RHS)) == RHS 1501 // replace with an icmp of the form icmp unsigned-cond Op, trunc(RHS) 1502 // when zext is loop varying and RHS is loop invariant. This converts 1503 // loop varying work to loop-invariant work. 1504 auto doRotateTransform = [&]() { 1505 assert(ICmp->isUnsigned() && "must have proven unsigned already"); 1506 auto *NewRHS = CastInst::Create( 1507 Instruction::Trunc, RHS, LHSOp->getType(), "", 1508 L->getLoopPreheader()->getTerminator()->getIterator()); 1509 // NewRHS is an operation that has been hoisted out of the loop, and 1510 // therefore should have a dropped location. 1511 NewRHS->setDebugLoc(DebugLoc::getDropped()); 1512 ICmp->setOperand(Swapped ? 1 : 0, LHSOp); 1513 ICmp->setOperand(Swapped ? 0 : 1, NewRHS); 1514 // Samesign flag cannot be preserved after narrowing the compare. 1515 ICmp->setSameSign(false); 1516 if (LHS->use_empty()) 1517 DeadInsts.push_back(LHS); 1518 }; 1519 1520 const unsigned InnerBitWidth = DL.getTypeSizeInBits(LHSOp->getType()); 1521 const unsigned OuterBitWidth = DL.getTypeSizeInBits(RHS->getType()); 1522 auto FullCR = ConstantRange::getFull(InnerBitWidth); 1523 FullCR = FullCR.zeroExtend(OuterBitWidth); 1524 auto RHSCR = SE->getUnsignedRange(SE->applyLoopGuards(SE->getSCEV(RHS), L)); 1525 if (FullCR.contains(RHSCR)) { 1526 doRotateTransform(); 1527 Changed = true; 1528 // Note, we are leaving SCEV in an unfortunately imprecise case here 1529 // as rotation tends to reveal information about trip counts not 1530 // previously visible. 1531 continue; 1532 } 1533 } 1534 1535 return Changed; 1536 } 1537 1538 bool IndVarSimplify::optimizeLoopExits(Loop *L, SCEVExpander &Rewriter) { 1539 SmallVector<BasicBlock*, 16> ExitingBlocks; 1540 L->getExitingBlocks(ExitingBlocks); 1541 1542 // Remove all exits which aren't both rewriteable and execute on every 1543 // iteration. 1544 llvm::erase_if(ExitingBlocks, [&](BasicBlock *ExitingBB) { 1545 // If our exitting block exits multiple loops, we can only rewrite the 1546 // innermost one. Otherwise, we're changing how many times the innermost 1547 // loop runs before it exits. 1548 if (LI->getLoopFor(ExitingBB) != L) 1549 return true; 1550 1551 // Can't rewrite non-branch yet. 1552 BranchInst *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator()); 1553 if (!BI) 1554 return true; 1555 1556 // Likewise, the loop latch must be dominated by the exiting BB. 1557 if (!DT->dominates(ExitingBB, L->getLoopLatch())) 1558 return true; 1559 1560 if (auto *CI = dyn_cast<ConstantInt>(BI->getCondition())) { 1561 // If already constant, nothing to do. However, if this is an 1562 // unconditional exit, we can still replace header phis with their 1563 // preheader value. 1564 if (!L->contains(BI->getSuccessor(CI->isNullValue()))) 1565 replaceLoopPHINodesWithPreheaderValues(LI, L, DeadInsts, *SE); 1566 return true; 1567 } 1568 1569 return false; 1570 }); 1571 1572 if (ExitingBlocks.empty()) 1573 return false; 1574 1575 // Get a symbolic upper bound on the loop backedge taken count. 1576 const SCEV *MaxBECount = SE->getSymbolicMaxBackedgeTakenCount(L); 1577 if (isa<SCEVCouldNotCompute>(MaxBECount)) 1578 return false; 1579 1580 // Visit our exit blocks in order of dominance. We know from the fact that 1581 // all exits must dominate the latch, so there is a total dominance order 1582 // between them. 1583 llvm::sort(ExitingBlocks, [&](BasicBlock *A, BasicBlock *B) { 1584 // std::sort sorts in ascending order, so we want the inverse of 1585 // the normal dominance relation. 1586 if (A == B) return false; 1587 if (DT->properlyDominates(A, B)) 1588 return true; 1589 else { 1590 assert(DT->properlyDominates(B, A) && 1591 "expected total dominance order!"); 1592 return false; 1593 } 1594 }); 1595 #ifdef ASSERT 1596 for (unsigned i = 1; i < ExitingBlocks.size(); i++) { 1597 assert(DT->dominates(ExitingBlocks[i-1], ExitingBlocks[i])); 1598 } 1599 #endif 1600 1601 bool Changed = false; 1602 bool SkipLastIter = false; 1603 const SCEV *CurrMaxExit = SE->getCouldNotCompute(); 1604 auto UpdateSkipLastIter = [&](const SCEV *MaxExitCount) { 1605 if (SkipLastIter || isa<SCEVCouldNotCompute>(MaxExitCount)) 1606 return; 1607 if (isa<SCEVCouldNotCompute>(CurrMaxExit)) 1608 CurrMaxExit = MaxExitCount; 1609 else 1610 CurrMaxExit = SE->getUMinFromMismatchedTypes(CurrMaxExit, MaxExitCount); 1611 // If the loop has more than 1 iteration, all further checks will be 1612 // executed 1 iteration less. 1613 if (CurrMaxExit == MaxBECount) 1614 SkipLastIter = true; 1615 }; 1616 SmallSet<const SCEV *, 8> DominatingExactExitCounts; 1617 for (BasicBlock *ExitingBB : ExitingBlocks) { 1618 const SCEV *ExactExitCount = SE->getExitCount(L, ExitingBB); 1619 const SCEV *MaxExitCount = SE->getExitCount( 1620 L, ExitingBB, ScalarEvolution::ExitCountKind::SymbolicMaximum); 1621 if (isa<SCEVCouldNotCompute>(ExactExitCount)) { 1622 // Okay, we do not know the exit count here. Can we at least prove that it 1623 // will remain the same within iteration space? 1624 auto *BI = cast<BranchInst>(ExitingBB->getTerminator()); 1625 auto OptimizeCond = [&](bool SkipLastIter) { 1626 return optimizeLoopExitWithUnknownExitCount(L, BI, ExitingBB, 1627 MaxBECount, SkipLastIter, 1628 SE, Rewriter, DeadInsts); 1629 }; 1630 1631 // TODO: We might have proved that we can skip the last iteration for 1632 // this check. In this case, we only want to check the condition on the 1633 // pre-last iteration (MaxBECount - 1). However, there is a nasty 1634 // corner case: 1635 // 1636 // for (i = len; i != 0; i--) { ... check (i ult X) ... } 1637 // 1638 // If we could not prove that len != 0, then we also could not prove that 1639 // (len - 1) is not a UINT_MAX. If we simply query (len - 1), then 1640 // OptimizeCond will likely not prove anything for it, even if it could 1641 // prove the same fact for len. 1642 // 1643 // As a temporary solution, we query both last and pre-last iterations in 1644 // hope that we will be able to prove triviality for at least one of 1645 // them. We can stop querying MaxBECount for this case once SCEV 1646 // understands that (MaxBECount - 1) will not overflow here. 1647 if (OptimizeCond(false)) 1648 Changed = true; 1649 else if (SkipLastIter && OptimizeCond(true)) 1650 Changed = true; 1651 UpdateSkipLastIter(MaxExitCount); 1652 continue; 1653 } 1654 1655 UpdateSkipLastIter(ExactExitCount); 1656 1657 // If we know we'd exit on the first iteration, rewrite the exit to 1658 // reflect this. This does not imply the loop must exit through this 1659 // exit; there may be an earlier one taken on the first iteration. 1660 // We know that the backedge can't be taken, so we replace all 1661 // the header PHIs with values coming from the preheader. 1662 if (ExactExitCount->isZero()) { 1663 foldExit(L, ExitingBB, true, DeadInsts); 1664 replaceLoopPHINodesWithPreheaderValues(LI, L, DeadInsts, *SE); 1665 Changed = true; 1666 continue; 1667 } 1668 1669 assert(ExactExitCount->getType()->isIntegerTy() && 1670 MaxBECount->getType()->isIntegerTy() && 1671 "Exit counts must be integers"); 1672 1673 Type *WiderType = 1674 SE->getWiderType(MaxBECount->getType(), ExactExitCount->getType()); 1675 ExactExitCount = SE->getNoopOrZeroExtend(ExactExitCount, WiderType); 1676 MaxBECount = SE->getNoopOrZeroExtend(MaxBECount, WiderType); 1677 assert(MaxBECount->getType() == ExactExitCount->getType()); 1678 1679 // Can we prove that some other exit must be taken strictly before this 1680 // one? 1681 if (SE->isLoopEntryGuardedByCond(L, CmpInst::ICMP_ULT, MaxBECount, 1682 ExactExitCount)) { 1683 foldExit(L, ExitingBB, false, DeadInsts); 1684 Changed = true; 1685 continue; 1686 } 1687 1688 // As we run, keep track of which exit counts we've encountered. If we 1689 // find a duplicate, we've found an exit which would have exited on the 1690 // exiting iteration, but (from the visit order) strictly follows another 1691 // which does the same and is thus dead. 1692 if (!DominatingExactExitCounts.insert(ExactExitCount).second) { 1693 foldExit(L, ExitingBB, false, DeadInsts); 1694 Changed = true; 1695 continue; 1696 } 1697 1698 // TODO: There might be another oppurtunity to leverage SCEV's reasoning 1699 // here. If we kept track of the min of dominanting exits so far, we could 1700 // discharge exits with EC >= MDEC. This is less powerful than the existing 1701 // transform (since later exits aren't considered), but potentially more 1702 // powerful for any case where SCEV can prove a >=u b, but neither a == b 1703 // or a >u b. Such a case is not currently known. 1704 } 1705 return Changed; 1706 } 1707 1708 bool IndVarSimplify::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) { 1709 SmallVector<BasicBlock*, 16> ExitingBlocks; 1710 L->getExitingBlocks(ExitingBlocks); 1711 1712 // Finally, see if we can rewrite our exit conditions into a loop invariant 1713 // form. If we have a read-only loop, and we can tell that we must exit down 1714 // a path which does not need any of the values computed within the loop, we 1715 // can rewrite the loop to exit on the first iteration. Note that this 1716 // doesn't either a) tell us the loop exits on the first iteration (unless 1717 // *all* exits are predicateable) or b) tell us *which* exit might be taken. 1718 // This transformation looks a lot like a restricted form of dead loop 1719 // elimination, but restricted to read-only loops and without neccesssarily 1720 // needing to kill the loop entirely. 1721 if (!LoopPredication) 1722 return false; 1723 1724 // Note: ExactBTC is the exact backedge taken count *iff* the loop exits 1725 // through *explicit* control flow. We have to eliminate the possibility of 1726 // implicit exits (see below) before we know it's truly exact. 1727 const SCEV *ExactBTC = SE->getBackedgeTakenCount(L); 1728 if (isa<SCEVCouldNotCompute>(ExactBTC) || !Rewriter.isSafeToExpand(ExactBTC)) 1729 return false; 1730 1731 assert(SE->isLoopInvariant(ExactBTC, L) && "BTC must be loop invariant"); 1732 assert(ExactBTC->getType()->isIntegerTy() && "BTC must be integer"); 1733 1734 auto BadExit = [&](BasicBlock *ExitingBB) { 1735 // If our exiting block exits multiple loops, we can only rewrite the 1736 // innermost one. Otherwise, we're changing how many times the innermost 1737 // loop runs before it exits. 1738 if (LI->getLoopFor(ExitingBB) != L) 1739 return true; 1740 1741 // Can't rewrite non-branch yet. 1742 BranchInst *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator()); 1743 if (!BI) 1744 return true; 1745 1746 // If already constant, nothing to do. 1747 if (isa<Constant>(BI->getCondition())) 1748 return true; 1749 1750 // If the exit block has phis, we need to be able to compute the values 1751 // within the loop which contains them. This assumes trivially lcssa phis 1752 // have already been removed; TODO: generalize 1753 BasicBlock *ExitBlock = 1754 BI->getSuccessor(L->contains(BI->getSuccessor(0)) ? 1 : 0); 1755 if (!ExitBlock->phis().empty()) 1756 return true; 1757 1758 const SCEV *ExitCount = SE->getExitCount(L, ExitingBB); 1759 if (isa<SCEVCouldNotCompute>(ExitCount) || 1760 !Rewriter.isSafeToExpand(ExitCount)) 1761 return true; 1762 1763 assert(SE->isLoopInvariant(ExitCount, L) && 1764 "Exit count must be loop invariant"); 1765 assert(ExitCount->getType()->isIntegerTy() && "Exit count must be integer"); 1766 return false; 1767 }; 1768 1769 // Make sure all exits dominate the latch. This means there is a linear chain 1770 // of exits. We check this before sorting so we have a total order. 1771 BasicBlock *Latch = L->getLoopLatch(); 1772 for (BasicBlock *ExitingBB : ExitingBlocks) 1773 if (!DT->dominates(ExitingBB, Latch)) 1774 return false; 1775 1776 // If we have any exits which can't be predicated themselves, than we can't 1777 // predicate any exit which isn't guaranteed to execute before it. Consider 1778 // two exits (a) and (b) which would both exit on the same iteration. If we 1779 // can predicate (b), but not (a), and (a) preceeds (b) along some path, then 1780 // we could convert a loop from exiting through (a) to one exiting through 1781 // (b). Note that this problem exists only for exits with the same exit 1782 // count, and we could be more aggressive when exit counts are known inequal. 1783 llvm::sort(ExitingBlocks, [&](BasicBlock *A, BasicBlock *B) { 1784 // llvm::sort sorts in ascending order, so we want the inverse of 1785 // the normal dominance relation. 1786 if (A == B) 1787 return false; 1788 if (DT->properlyDominates(A, B)) 1789 return true; 1790 if (DT->properlyDominates(B, A)) 1791 return false; 1792 llvm_unreachable("Should have total dominance order"); 1793 }); 1794 1795 // Make sure our exit blocks are really a total order (i.e. a linear chain of 1796 // exits before the backedge). 1797 for (unsigned i = 1; i < ExitingBlocks.size(); i++) 1798 assert(DT->dominates(ExitingBlocks[i - 1], ExitingBlocks[i]) && 1799 "Not sorted by dominance"); 1800 1801 // Given our sorted total order, we know that exit[j] must be evaluated 1802 // after all exit[i] such j > i. 1803 for (unsigned i = 0, e = ExitingBlocks.size(); i < e; i++) 1804 if (BadExit(ExitingBlocks[i])) { 1805 ExitingBlocks.resize(i); 1806 break; 1807 } 1808 1809 if (ExitingBlocks.empty()) 1810 return false; 1811 1812 // At this point, ExitingBlocks consists of only those blocks which are 1813 // predicatable. Given that, we know we have at least one exit we can 1814 // predicate if the loop is doesn't have side effects and doesn't have any 1815 // implicit exits (because then our exact BTC isn't actually exact). 1816 // @Reviewers - As structured, this is O(I^2) for loop nests. Any 1817 // suggestions on how to improve this? I can obviously bail out for outer 1818 // loops, but that seems less than ideal. MemorySSA can find memory writes, 1819 // is that enough for *all* side effects? 1820 for (BasicBlock *BB : L->blocks()) 1821 for (auto &I : *BB) 1822 // TODO:isGuaranteedToTransfer 1823 if (I.mayHaveSideEffects()) 1824 return false; 1825 1826 bool Changed = false; 1827 // Finally, do the actual predication for all predicatable blocks. A couple 1828 // of notes here: 1829 // 1) We don't bother to constant fold dominated exits with identical exit 1830 // counts; that's simply a form of CSE/equality propagation and we leave 1831 // it for dedicated passes. 1832 // 2) We insert the comparison at the branch. Hoisting introduces additional 1833 // legality constraints and we leave that to dedicated logic. We want to 1834 // predicate even if we can't insert a loop invariant expression as 1835 // peeling or unrolling will likely reduce the cost of the otherwise loop 1836 // varying check. 1837 Rewriter.setInsertPoint(L->getLoopPreheader()->getTerminator()); 1838 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 1839 Value *ExactBTCV = nullptr; // Lazily generated if needed. 1840 for (BasicBlock *ExitingBB : ExitingBlocks) { 1841 const SCEV *ExitCount = SE->getExitCount(L, ExitingBB); 1842 1843 auto *BI = cast<BranchInst>(ExitingBB->getTerminator()); 1844 Value *NewCond; 1845 if (ExitCount == ExactBTC) { 1846 NewCond = L->contains(BI->getSuccessor(0)) ? 1847 B.getFalse() : B.getTrue(); 1848 } else { 1849 Value *ECV = Rewriter.expandCodeFor(ExitCount); 1850 if (!ExactBTCV) 1851 ExactBTCV = Rewriter.expandCodeFor(ExactBTC); 1852 Value *RHS = ExactBTCV; 1853 if (ECV->getType() != RHS->getType()) { 1854 Type *WiderTy = SE->getWiderType(ECV->getType(), RHS->getType()); 1855 ECV = B.CreateZExt(ECV, WiderTy); 1856 RHS = B.CreateZExt(RHS, WiderTy); 1857 } 1858 auto Pred = L->contains(BI->getSuccessor(0)) ? 1859 ICmpInst::ICMP_NE : ICmpInst::ICMP_EQ; 1860 NewCond = B.CreateICmp(Pred, ECV, RHS); 1861 } 1862 Value *OldCond = BI->getCondition(); 1863 BI->setCondition(NewCond); 1864 if (OldCond->use_empty()) 1865 DeadInsts.emplace_back(OldCond); 1866 Changed = true; 1867 RunUnswitching = true; 1868 } 1869 1870 return Changed; 1871 } 1872 1873 //===----------------------------------------------------------------------===// 1874 // IndVarSimplify driver. Manage several subpasses of IV simplification. 1875 //===----------------------------------------------------------------------===// 1876 1877 bool IndVarSimplify::run(Loop *L) { 1878 // We need (and expect!) the incoming loop to be in LCSSA. 1879 assert(L->isRecursivelyLCSSAForm(*DT, *LI) && 1880 "LCSSA required to run indvars!"); 1881 1882 // If LoopSimplify form is not available, stay out of trouble. Some notes: 1883 // - LSR currently only supports LoopSimplify-form loops. Indvars' 1884 // canonicalization can be a pessimization without LSR to "clean up" 1885 // afterwards. 1886 // - We depend on having a preheader; in particular, 1887 // Loop::getCanonicalInductionVariable only supports loops with preheaders, 1888 // and we're in trouble if we can't find the induction variable even when 1889 // we've manually inserted one. 1890 // - LFTR relies on having a single backedge. 1891 if (!L->isLoopSimplifyForm()) 1892 return false; 1893 1894 bool Changed = false; 1895 // If there are any floating-point recurrences, attempt to 1896 // transform them to use integer recurrences. 1897 Changed |= rewriteNonIntegerIVs(L); 1898 1899 // Create a rewriter object which we'll use to transform the code with. 1900 SCEVExpander Rewriter(*SE, DL, "indvars"); 1901 #if LLVM_ENABLE_ABI_BREAKING_CHECKS 1902 Rewriter.setDebugType(DEBUG_TYPE); 1903 #endif 1904 1905 // Eliminate redundant IV users. 1906 // 1907 // Simplification works best when run before other consumers of SCEV. We 1908 // attempt to avoid evaluating SCEVs for sign/zero extend operations until 1909 // other expressions involving loop IVs have been evaluated. This helps SCEV 1910 // set no-wrap flags before normalizing sign/zero extension. 1911 Rewriter.disableCanonicalMode(); 1912 Changed |= simplifyAndExtend(L, Rewriter, LI); 1913 1914 // Check to see if we can compute the final value of any expressions 1915 // that are recurrent in the loop, and substitute the exit values from the 1916 // loop into any instructions outside of the loop that use the final values 1917 // of the current expressions. 1918 if (ReplaceExitValue != NeverRepl) { 1919 if (int Rewrites = rewriteLoopExitValues(L, LI, TLI, SE, TTI, Rewriter, DT, 1920 ReplaceExitValue, DeadInsts)) { 1921 NumReplaced += Rewrites; 1922 Changed = true; 1923 } 1924 } 1925 1926 // Eliminate redundant IV cycles. 1927 NumElimIV += Rewriter.replaceCongruentIVs(L, DT, DeadInsts, TTI); 1928 1929 // Try to convert exit conditions to unsigned and rotate computation 1930 // out of the loop. Note: Handles invalidation internally if needed. 1931 Changed |= canonicalizeExitCondition(L); 1932 1933 // Try to eliminate loop exits based on analyzeable exit counts 1934 if (optimizeLoopExits(L, Rewriter)) { 1935 Changed = true; 1936 // Given we've changed exit counts, notify SCEV 1937 // Some nested loops may share same folded exit basic block, 1938 // thus we need to notify top most loop. 1939 SE->forgetTopmostLoop(L); 1940 } 1941 1942 // Try to form loop invariant tests for loop exits by changing how many 1943 // iterations of the loop run when that is unobservable. 1944 if (predicateLoopExits(L, Rewriter)) { 1945 Changed = true; 1946 // Given we've changed exit counts, notify SCEV 1947 SE->forgetLoop(L); 1948 } 1949 1950 // If we have a trip count expression, rewrite the loop's exit condition 1951 // using it. 1952 if (!DisableLFTR) { 1953 BasicBlock *PreHeader = L->getLoopPreheader(); 1954 1955 SmallVector<BasicBlock*, 16> ExitingBlocks; 1956 L->getExitingBlocks(ExitingBlocks); 1957 for (BasicBlock *ExitingBB : ExitingBlocks) { 1958 // Can't rewrite non-branch yet. 1959 if (!isa<BranchInst>(ExitingBB->getTerminator())) 1960 continue; 1961 1962 // If our exitting block exits multiple loops, we can only rewrite the 1963 // innermost one. Otherwise, we're changing how many times the innermost 1964 // loop runs before it exits. 1965 if (LI->getLoopFor(ExitingBB) != L) 1966 continue; 1967 1968 if (!needsLFTR(L, ExitingBB)) 1969 continue; 1970 1971 const SCEV *ExitCount = SE->getExitCount(L, ExitingBB); 1972 if (isa<SCEVCouldNotCompute>(ExitCount)) 1973 continue; 1974 1975 // This was handled above, but as we form SCEVs, we can sometimes refine 1976 // existing ones; this allows exit counts to be folded to zero which 1977 // weren't when optimizeLoopExits saw them. Arguably, we should iterate 1978 // until stable to handle cases like this better. 1979 if (ExitCount->isZero()) 1980 continue; 1981 1982 PHINode *IndVar = FindLoopCounter(L, ExitingBB, ExitCount, SE, DT); 1983 if (!IndVar) 1984 continue; 1985 1986 // Avoid high cost expansions. Note: This heuristic is questionable in 1987 // that our definition of "high cost" is not exactly principled. 1988 if (Rewriter.isHighCostExpansion(ExitCount, L, SCEVCheapExpansionBudget, 1989 TTI, PreHeader->getTerminator())) 1990 continue; 1991 1992 if (!Rewriter.isSafeToExpand(ExitCount)) 1993 continue; 1994 1995 Changed |= linearFunctionTestReplace(L, ExitingBB, 1996 ExitCount, IndVar, 1997 Rewriter); 1998 } 1999 } 2000 // Clear the rewriter cache, because values that are in the rewriter's cache 2001 // can be deleted in the loop below, causing the AssertingVH in the cache to 2002 // trigger. 2003 Rewriter.clear(); 2004 2005 // Now that we're done iterating through lists, clean up any instructions 2006 // which are now dead. 2007 while (!DeadInsts.empty()) { 2008 Value *V = DeadInsts.pop_back_val(); 2009 2010 if (PHINode *PHI = dyn_cast_or_null<PHINode>(V)) 2011 Changed |= RecursivelyDeleteDeadPHINode(PHI, TLI, MSSAU.get()); 2012 else if (Instruction *Inst = dyn_cast_or_null<Instruction>(V)) 2013 Changed |= 2014 RecursivelyDeleteTriviallyDeadInstructions(Inst, TLI, MSSAU.get()); 2015 } 2016 2017 // The Rewriter may not be used from this point on. 2018 2019 // Loop-invariant instructions in the preheader that aren't used in the 2020 // loop may be sunk below the loop to reduce register pressure. 2021 Changed |= sinkUnusedInvariants(L); 2022 2023 // rewriteFirstIterationLoopExitValues does not rely on the computation of 2024 // trip count and therefore can further simplify exit values in addition to 2025 // rewriteLoopExitValues. 2026 Changed |= rewriteFirstIterationLoopExitValues(L); 2027 2028 // Clean up dead instructions. 2029 Changed |= DeleteDeadPHIs(L->getHeader(), TLI, MSSAU.get()); 2030 2031 // Check a post-condition. 2032 assert(L->isRecursivelyLCSSAForm(*DT, *LI) && 2033 "Indvars did not preserve LCSSA!"); 2034 if (VerifyMemorySSA && MSSAU) 2035 MSSAU->getMemorySSA()->verifyMemorySSA(); 2036 2037 return Changed; 2038 } 2039 2040 PreservedAnalyses IndVarSimplifyPass::run(Loop &L, LoopAnalysisManager &AM, 2041 LoopStandardAnalysisResults &AR, 2042 LPMUpdater &) { 2043 Function *F = L.getHeader()->getParent(); 2044 const DataLayout &DL = F->getDataLayout(); 2045 2046 IndVarSimplify IVS(&AR.LI, &AR.SE, &AR.DT, DL, &AR.TLI, &AR.TTI, AR.MSSA, 2047 WidenIndVars && AllowIVWidening); 2048 if (!IVS.run(&L)) 2049 return PreservedAnalyses::all(); 2050 2051 auto PA = getLoopPassPreservedAnalyses(); 2052 PA.preserveSet<CFGAnalyses>(); 2053 if (IVS.runUnswitching()) { 2054 AM.getResult<ShouldRunExtraSimpleLoopUnswitch>(L, AR); 2055 PA.preserve<ShouldRunExtraSimpleLoopUnswitch>(); 2056 } 2057 2058 if (AR.MSSA) 2059 PA.preserve<MemorySSAAnalysis>(); 2060 return PA; 2061 } 2062