1 //===- IndVarSimplify.cpp - Induction Variable Elimination ----------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This transformation analyzes and transforms the induction variables (and 10 // computations derived from them) into simpler forms suitable for subsequent 11 // analysis and transformation. 12 // 13 // If the trip count of a loop is computable, this pass also makes the following 14 // changes: 15 // 1. The exit condition for the loop is canonicalized to compare the 16 // induction value against the exit value. This turns loops like: 17 // 'for (i = 7; i*i < 1000; ++i)' into 'for (i = 0; i != 25; ++i)' 18 // 2. Any use outside of the loop of an expression derived from the indvar 19 // is changed to compute the derived value outside of the loop, eliminating 20 // the dependence on the exit value of the induction variable. If the only 21 // purpose of the loop is to compute the exit value of some derived 22 // expression, this transformation will make the loop dead. 23 // 24 //===----------------------------------------------------------------------===// 25 26 #include "llvm/Transforms/Scalar/IndVarSimplify.h" 27 #include "llvm/ADT/APFloat.h" 28 #include "llvm/ADT/ArrayRef.h" 29 #include "llvm/ADT/STLExtras.h" 30 #include "llvm/ADT/SmallPtrSet.h" 31 #include "llvm/ADT/SmallSet.h" 32 #include "llvm/ADT/SmallVector.h" 33 #include "llvm/ADT/Statistic.h" 34 #include "llvm/ADT/iterator_range.h" 35 #include "llvm/Analysis/LoopInfo.h" 36 #include "llvm/Analysis/LoopPass.h" 37 #include "llvm/Analysis/MemorySSA.h" 38 #include "llvm/Analysis/MemorySSAUpdater.h" 39 #include "llvm/Analysis/ScalarEvolution.h" 40 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 41 #include "llvm/Analysis/TargetLibraryInfo.h" 42 #include "llvm/Analysis/TargetTransformInfo.h" 43 #include "llvm/Analysis/ValueTracking.h" 44 #include "llvm/IR/BasicBlock.h" 45 #include "llvm/IR/Constant.h" 46 #include "llvm/IR/ConstantRange.h" 47 #include "llvm/IR/Constants.h" 48 #include "llvm/IR/DataLayout.h" 49 #include "llvm/IR/DerivedTypes.h" 50 #include "llvm/IR/Dominators.h" 51 #include "llvm/IR/Function.h" 52 #include "llvm/IR/IRBuilder.h" 53 #include "llvm/IR/InstrTypes.h" 54 #include "llvm/IR/Instruction.h" 55 #include "llvm/IR/Instructions.h" 56 #include "llvm/IR/IntrinsicInst.h" 57 #include "llvm/IR/Intrinsics.h" 58 #include "llvm/IR/Module.h" 59 #include "llvm/IR/Operator.h" 60 #include "llvm/IR/PassManager.h" 61 #include "llvm/IR/PatternMatch.h" 62 #include "llvm/IR/Type.h" 63 #include "llvm/IR/Use.h" 64 #include "llvm/IR/User.h" 65 #include "llvm/IR/Value.h" 66 #include "llvm/IR/ValueHandle.h" 67 #include "llvm/Support/Casting.h" 68 #include "llvm/Support/CommandLine.h" 69 #include "llvm/Support/Compiler.h" 70 #include "llvm/Support/Debug.h" 71 #include "llvm/Support/MathExtras.h" 72 #include "llvm/Support/raw_ostream.h" 73 #include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h" 74 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 75 #include "llvm/Transforms/Utils/Local.h" 76 #include "llvm/Transforms/Utils/LoopUtils.h" 77 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" 78 #include "llvm/Transforms/Utils/SimplifyIndVar.h" 79 #include <cassert> 80 #include <cstdint> 81 #include <utility> 82 83 using namespace llvm; 84 using namespace PatternMatch; 85 86 #define DEBUG_TYPE "indvars" 87 88 STATISTIC(NumWidened , "Number of indvars widened"); 89 STATISTIC(NumReplaced , "Number of exit values replaced"); 90 STATISTIC(NumLFTR , "Number of loop exit tests replaced"); 91 STATISTIC(NumElimExt , "Number of IV sign/zero extends eliminated"); 92 STATISTIC(NumElimIV , "Number of congruent IVs eliminated"); 93 94 static cl::opt<ReplaceExitVal> ReplaceExitValue( 95 "replexitval", cl::Hidden, cl::init(OnlyCheapRepl), 96 cl::desc("Choose the strategy to replace exit value in IndVarSimplify"), 97 cl::values( 98 clEnumValN(NeverRepl, "never", "never replace exit value"), 99 clEnumValN(OnlyCheapRepl, "cheap", 100 "only replace exit value when the cost is cheap"), 101 clEnumValN( 102 UnusedIndVarInLoop, "unusedindvarinloop", 103 "only replace exit value when it is an unused " 104 "induction variable in the loop and has cheap replacement cost"), 105 clEnumValN(NoHardUse, "noharduse", 106 "only replace exit values when loop def likely dead"), 107 clEnumValN(AlwaysRepl, "always", 108 "always replace exit value whenever possible"))); 109 110 static cl::opt<bool> UsePostIncrementRanges( 111 "indvars-post-increment-ranges", cl::Hidden, 112 cl::desc("Use post increment control-dependent ranges in IndVarSimplify"), 113 cl::init(true)); 114 115 static cl::opt<bool> 116 DisableLFTR("disable-lftr", cl::Hidden, cl::init(false), 117 cl::desc("Disable Linear Function Test Replace optimization")); 118 119 static cl::opt<bool> 120 LoopPredication("indvars-predicate-loops", cl::Hidden, cl::init(true), 121 cl::desc("Predicate conditions in read only loops")); 122 123 static cl::opt<bool> 124 AllowIVWidening("indvars-widen-indvars", cl::Hidden, cl::init(true), 125 cl::desc("Allow widening of indvars to eliminate s/zext")); 126 127 namespace { 128 129 class IndVarSimplify { 130 LoopInfo *LI; 131 ScalarEvolution *SE; 132 DominatorTree *DT; 133 const DataLayout &DL; 134 TargetLibraryInfo *TLI; 135 const TargetTransformInfo *TTI; 136 std::unique_ptr<MemorySSAUpdater> MSSAU; 137 138 SmallVector<WeakTrackingVH, 16> DeadInsts; 139 bool WidenIndVars; 140 141 bool RunUnswitching = false; 142 143 bool handleFloatingPointIV(Loop *L, PHINode *PH); 144 bool rewriteNonIntegerIVs(Loop *L); 145 146 bool simplifyAndExtend(Loop *L, SCEVExpander &Rewriter, LoopInfo *LI); 147 /// Try to improve our exit conditions by converting condition from signed 148 /// to unsigned or rotating computation out of the loop. 149 /// (See inline comment about why this is duplicated from simplifyAndExtend) 150 bool canonicalizeExitCondition(Loop *L); 151 /// Try to eliminate loop exits based on analyzeable exit counts 152 bool optimizeLoopExits(Loop *L, SCEVExpander &Rewriter); 153 /// Try to form loop invariant tests for loop exits by changing how many 154 /// iterations of the loop run when that is unobservable. 155 bool predicateLoopExits(Loop *L, SCEVExpander &Rewriter); 156 157 bool rewriteFirstIterationLoopExitValues(Loop *L); 158 159 bool linearFunctionTestReplace(Loop *L, BasicBlock *ExitingBB, 160 const SCEV *ExitCount, 161 PHINode *IndVar, SCEVExpander &Rewriter); 162 163 bool sinkUnusedInvariants(Loop *L); 164 165 public: 166 IndVarSimplify(LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 167 const DataLayout &DL, TargetLibraryInfo *TLI, 168 TargetTransformInfo *TTI, MemorySSA *MSSA, bool WidenIndVars) 169 : LI(LI), SE(SE), DT(DT), DL(DL), TLI(TLI), TTI(TTI), 170 WidenIndVars(WidenIndVars) { 171 if (MSSA) 172 MSSAU = std::make_unique<MemorySSAUpdater>(MSSA); 173 } 174 175 bool run(Loop *L); 176 177 bool runUnswitching() const { return RunUnswitching; } 178 }; 179 180 } // end anonymous namespace 181 182 //===----------------------------------------------------------------------===// 183 // rewriteNonIntegerIVs and helpers. Prefer integer IVs. 184 //===----------------------------------------------------------------------===// 185 186 /// Convert APF to an integer, if possible. 187 static bool ConvertToSInt(const APFloat &APF, int64_t &IntVal) { 188 bool isExact = false; 189 // See if we can convert this to an int64_t 190 uint64_t UIntVal; 191 if (APF.convertToInteger(MutableArrayRef(UIntVal), 64, true, 192 APFloat::rmTowardZero, &isExact) != APFloat::opOK || 193 !isExact) 194 return false; 195 IntVal = UIntVal; 196 return true; 197 } 198 199 /// If the loop has floating induction variable then insert corresponding 200 /// integer induction variable if possible. 201 /// For example, 202 /// for(double i = 0; i < 10000; ++i) 203 /// bar(i) 204 /// is converted into 205 /// for(int i = 0; i < 10000; ++i) 206 /// bar((double)i); 207 bool IndVarSimplify::handleFloatingPointIV(Loop *L, PHINode *PN) { 208 unsigned IncomingEdge = L->contains(PN->getIncomingBlock(0)); 209 unsigned BackEdge = IncomingEdge^1; 210 211 // Check incoming value. 212 auto *InitValueVal = dyn_cast<ConstantFP>(PN->getIncomingValue(IncomingEdge)); 213 214 int64_t InitValue; 215 if (!InitValueVal || !ConvertToSInt(InitValueVal->getValueAPF(), InitValue)) 216 return false; 217 218 // Check IV increment. Reject this PN if increment operation is not 219 // an add or increment value can not be represented by an integer. 220 auto *Incr = dyn_cast<BinaryOperator>(PN->getIncomingValue(BackEdge)); 221 if (Incr == nullptr || Incr->getOpcode() != Instruction::FAdd) return false; 222 223 // If this is not an add of the PHI with a constantfp, or if the constant fp 224 // is not an integer, bail out. 225 ConstantFP *IncValueVal = dyn_cast<ConstantFP>(Incr->getOperand(1)); 226 int64_t IncValue; 227 if (IncValueVal == nullptr || Incr->getOperand(0) != PN || 228 !ConvertToSInt(IncValueVal->getValueAPF(), IncValue)) 229 return false; 230 231 // Check Incr uses. One user is PN and the other user is an exit condition 232 // used by the conditional terminator. 233 Value::user_iterator IncrUse = Incr->user_begin(); 234 Instruction *U1 = cast<Instruction>(*IncrUse++); 235 if (IncrUse == Incr->user_end()) return false; 236 Instruction *U2 = cast<Instruction>(*IncrUse++); 237 if (IncrUse != Incr->user_end()) return false; 238 239 // Find exit condition, which is an fcmp. If it doesn't exist, or if it isn't 240 // only used by a branch, we can't transform it. 241 FCmpInst *Compare = dyn_cast<FCmpInst>(U1); 242 if (!Compare) 243 Compare = dyn_cast<FCmpInst>(U2); 244 if (!Compare || !Compare->hasOneUse() || 245 !isa<BranchInst>(Compare->user_back())) 246 return false; 247 248 BranchInst *TheBr = cast<BranchInst>(Compare->user_back()); 249 250 // We need to verify that the branch actually controls the iteration count 251 // of the loop. If not, the new IV can overflow and no one will notice. 252 // The branch block must be in the loop and one of the successors must be out 253 // of the loop. 254 assert(TheBr->isConditional() && "Can't use fcmp if not conditional"); 255 if (!L->contains(TheBr->getParent()) || 256 (L->contains(TheBr->getSuccessor(0)) && 257 L->contains(TheBr->getSuccessor(1)))) 258 return false; 259 260 // If it isn't a comparison with an integer-as-fp (the exit value), we can't 261 // transform it. 262 ConstantFP *ExitValueVal = dyn_cast<ConstantFP>(Compare->getOperand(1)); 263 int64_t ExitValue; 264 if (ExitValueVal == nullptr || 265 !ConvertToSInt(ExitValueVal->getValueAPF(), ExitValue)) 266 return false; 267 268 // Find new predicate for integer comparison. 269 CmpInst::Predicate NewPred = CmpInst::BAD_ICMP_PREDICATE; 270 switch (Compare->getPredicate()) { 271 default: return false; // Unknown comparison. 272 case CmpInst::FCMP_OEQ: 273 case CmpInst::FCMP_UEQ: NewPred = CmpInst::ICMP_EQ; break; 274 case CmpInst::FCMP_ONE: 275 case CmpInst::FCMP_UNE: NewPred = CmpInst::ICMP_NE; break; 276 case CmpInst::FCMP_OGT: 277 case CmpInst::FCMP_UGT: NewPred = CmpInst::ICMP_SGT; break; 278 case CmpInst::FCMP_OGE: 279 case CmpInst::FCMP_UGE: NewPred = CmpInst::ICMP_SGE; break; 280 case CmpInst::FCMP_OLT: 281 case CmpInst::FCMP_ULT: NewPred = CmpInst::ICMP_SLT; break; 282 case CmpInst::FCMP_OLE: 283 case CmpInst::FCMP_ULE: NewPred = CmpInst::ICMP_SLE; break; 284 } 285 286 // We convert the floating point induction variable to a signed i32 value if 287 // we can. This is only safe if the comparison will not overflow in a way 288 // that won't be trapped by the integer equivalent operations. Check for this 289 // now. 290 // TODO: We could use i64 if it is native and the range requires it. 291 292 // The start/stride/exit values must all fit in signed i32. 293 if (!isInt<32>(InitValue) || !isInt<32>(IncValue) || !isInt<32>(ExitValue)) 294 return false; 295 296 // If not actually striding (add x, 0.0), avoid touching the code. 297 if (IncValue == 0) 298 return false; 299 300 // Positive and negative strides have different safety conditions. 301 if (IncValue > 0) { 302 // If we have a positive stride, we require the init to be less than the 303 // exit value. 304 if (InitValue >= ExitValue) 305 return false; 306 307 uint32_t Range = uint32_t(ExitValue-InitValue); 308 // Check for infinite loop, either: 309 // while (i <= Exit) or until (i > Exit) 310 if (NewPred == CmpInst::ICMP_SLE || NewPred == CmpInst::ICMP_SGT) { 311 if (++Range == 0) return false; // Range overflows. 312 } 313 314 unsigned Leftover = Range % uint32_t(IncValue); 315 316 // If this is an equality comparison, we require that the strided value 317 // exactly land on the exit value, otherwise the IV condition will wrap 318 // around and do things the fp IV wouldn't. 319 if ((NewPred == CmpInst::ICMP_EQ || NewPred == CmpInst::ICMP_NE) && 320 Leftover != 0) 321 return false; 322 323 // If the stride would wrap around the i32 before exiting, we can't 324 // transform the IV. 325 if (Leftover != 0 && int32_t(ExitValue+IncValue) < ExitValue) 326 return false; 327 } else { 328 // If we have a negative stride, we require the init to be greater than the 329 // exit value. 330 if (InitValue <= ExitValue) 331 return false; 332 333 uint32_t Range = uint32_t(InitValue-ExitValue); 334 // Check for infinite loop, either: 335 // while (i >= Exit) or until (i < Exit) 336 if (NewPred == CmpInst::ICMP_SGE || NewPred == CmpInst::ICMP_SLT) { 337 if (++Range == 0) return false; // Range overflows. 338 } 339 340 unsigned Leftover = Range % uint32_t(-IncValue); 341 342 // If this is an equality comparison, we require that the strided value 343 // exactly land on the exit value, otherwise the IV condition will wrap 344 // around and do things the fp IV wouldn't. 345 if ((NewPred == CmpInst::ICMP_EQ || NewPred == CmpInst::ICMP_NE) && 346 Leftover != 0) 347 return false; 348 349 // If the stride would wrap around the i32 before exiting, we can't 350 // transform the IV. 351 if (Leftover != 0 && int32_t(ExitValue+IncValue) > ExitValue) 352 return false; 353 } 354 355 IntegerType *Int32Ty = Type::getInt32Ty(PN->getContext()); 356 357 // Insert new integer induction variable. 358 PHINode *NewPHI = 359 PHINode::Create(Int32Ty, 2, PN->getName() + ".int", PN->getIterator()); 360 NewPHI->addIncoming(ConstantInt::get(Int32Ty, InitValue), 361 PN->getIncomingBlock(IncomingEdge)); 362 NewPHI->setDebugLoc(PN->getDebugLoc()); 363 364 Instruction *NewAdd = 365 BinaryOperator::CreateAdd(NewPHI, ConstantInt::get(Int32Ty, IncValue), 366 Incr->getName() + ".int", Incr->getIterator()); 367 NewAdd->setDebugLoc(Incr->getDebugLoc()); 368 NewPHI->addIncoming(NewAdd, PN->getIncomingBlock(BackEdge)); 369 370 ICmpInst *NewCompare = 371 new ICmpInst(TheBr->getIterator(), NewPred, NewAdd, 372 ConstantInt::get(Int32Ty, ExitValue), Compare->getName()); 373 NewCompare->setDebugLoc(Compare->getDebugLoc()); 374 375 // In the following deletions, PN may become dead and may be deleted. 376 // Use a WeakTrackingVH to observe whether this happens. 377 WeakTrackingVH WeakPH = PN; 378 379 // Delete the old floating point exit comparison. The branch starts using the 380 // new comparison. 381 NewCompare->takeName(Compare); 382 Compare->replaceAllUsesWith(NewCompare); 383 RecursivelyDeleteTriviallyDeadInstructions(Compare, TLI, MSSAU.get()); 384 385 // Delete the old floating point increment. 386 Incr->replaceAllUsesWith(PoisonValue::get(Incr->getType())); 387 RecursivelyDeleteTriviallyDeadInstructions(Incr, TLI, MSSAU.get()); 388 389 // If the FP induction variable still has uses, this is because something else 390 // in the loop uses its value. In order to canonicalize the induction 391 // variable, we chose to eliminate the IV and rewrite it in terms of an 392 // int->fp cast. 393 // 394 // We give preference to sitofp over uitofp because it is faster on most 395 // platforms. 396 if (WeakPH) { 397 Instruction *Conv = new SIToFPInst(NewPHI, PN->getType(), "indvar.conv", 398 PN->getParent()->getFirstInsertionPt()); 399 Conv->setDebugLoc(PN->getDebugLoc()); 400 PN->replaceAllUsesWith(Conv); 401 RecursivelyDeleteTriviallyDeadInstructions(PN, TLI, MSSAU.get()); 402 } 403 return true; 404 } 405 406 bool IndVarSimplify::rewriteNonIntegerIVs(Loop *L) { 407 // First step. Check to see if there are any floating-point recurrences. 408 // If there are, change them into integer recurrences, permitting analysis by 409 // the SCEV routines. 410 BasicBlock *Header = L->getHeader(); 411 412 SmallVector<WeakTrackingVH, 8> PHIs; 413 for (PHINode &PN : Header->phis()) 414 PHIs.push_back(&PN); 415 416 bool Changed = false; 417 for (WeakTrackingVH &PHI : PHIs) 418 if (PHINode *PN = dyn_cast_or_null<PHINode>(&*PHI)) 419 Changed |= handleFloatingPointIV(L, PN); 420 421 // If the loop previously had floating-point IV, ScalarEvolution 422 // may not have been able to compute a trip count. Now that we've done some 423 // re-writing, the trip count may be computable. 424 if (Changed) 425 SE->forgetLoop(L); 426 return Changed; 427 } 428 429 //===---------------------------------------------------------------------===// 430 // rewriteFirstIterationLoopExitValues: Rewrite loop exit values if we know 431 // they will exit at the first iteration. 432 //===---------------------------------------------------------------------===// 433 434 /// Check to see if this loop has loop invariant conditions which lead to loop 435 /// exits. If so, we know that if the exit path is taken, it is at the first 436 /// loop iteration. This lets us predict exit values of PHI nodes that live in 437 /// loop header. 438 bool IndVarSimplify::rewriteFirstIterationLoopExitValues(Loop *L) { 439 // Verify the input to the pass is already in LCSSA form. 440 assert(L->isLCSSAForm(*DT)); 441 442 SmallVector<BasicBlock *, 8> ExitBlocks; 443 L->getUniqueExitBlocks(ExitBlocks); 444 445 bool MadeAnyChanges = false; 446 for (auto *ExitBB : ExitBlocks) { 447 // If there are no more PHI nodes in this exit block, then no more 448 // values defined inside the loop are used on this path. 449 for (PHINode &PN : ExitBB->phis()) { 450 for (unsigned IncomingValIdx = 0, E = PN.getNumIncomingValues(); 451 IncomingValIdx != E; ++IncomingValIdx) { 452 auto *IncomingBB = PN.getIncomingBlock(IncomingValIdx); 453 454 // Can we prove that the exit must run on the first iteration if it 455 // runs at all? (i.e. early exits are fine for our purposes, but 456 // traces which lead to this exit being taken on the 2nd iteration 457 // aren't.) Note that this is about whether the exit branch is 458 // executed, not about whether it is taken. 459 if (!L->getLoopLatch() || 460 !DT->dominates(IncomingBB, L->getLoopLatch())) 461 continue; 462 463 // Get condition that leads to the exit path. 464 auto *TermInst = IncomingBB->getTerminator(); 465 466 Value *Cond = nullptr; 467 if (auto *BI = dyn_cast<BranchInst>(TermInst)) { 468 // Must be a conditional branch, otherwise the block 469 // should not be in the loop. 470 Cond = BI->getCondition(); 471 } else if (auto *SI = dyn_cast<SwitchInst>(TermInst)) 472 Cond = SI->getCondition(); 473 else 474 continue; 475 476 if (!L->isLoopInvariant(Cond)) 477 continue; 478 479 auto *ExitVal = dyn_cast<PHINode>(PN.getIncomingValue(IncomingValIdx)); 480 481 // Only deal with PHIs in the loop header. 482 if (!ExitVal || ExitVal->getParent() != L->getHeader()) 483 continue; 484 485 // If ExitVal is a PHI on the loop header, then we know its 486 // value along this exit because the exit can only be taken 487 // on the first iteration. 488 auto *LoopPreheader = L->getLoopPreheader(); 489 assert(LoopPreheader && "Invalid loop"); 490 int PreheaderIdx = ExitVal->getBasicBlockIndex(LoopPreheader); 491 if (PreheaderIdx != -1) { 492 assert(ExitVal->getParent() == L->getHeader() && 493 "ExitVal must be in loop header"); 494 MadeAnyChanges = true; 495 PN.setIncomingValue(IncomingValIdx, 496 ExitVal->getIncomingValue(PreheaderIdx)); 497 SE->forgetValue(&PN); 498 } 499 } 500 } 501 } 502 return MadeAnyChanges; 503 } 504 505 //===----------------------------------------------------------------------===// 506 // IV Widening - Extend the width of an IV to cover its widest uses. 507 //===----------------------------------------------------------------------===// 508 509 /// Update information about the induction variable that is extended by this 510 /// sign or zero extend operation. This is used to determine the final width of 511 /// the IV before actually widening it. 512 static void visitIVCast(CastInst *Cast, WideIVInfo &WI, 513 ScalarEvolution *SE, 514 const TargetTransformInfo *TTI) { 515 bool IsSigned = Cast->getOpcode() == Instruction::SExt; 516 if (!IsSigned && Cast->getOpcode() != Instruction::ZExt) 517 return; 518 519 Type *Ty = Cast->getType(); 520 uint64_t Width = SE->getTypeSizeInBits(Ty); 521 if (!Cast->getDataLayout().isLegalInteger(Width)) 522 return; 523 524 // Check that `Cast` actually extends the induction variable (we rely on this 525 // later). This takes care of cases where `Cast` is extending a truncation of 526 // the narrow induction variable, and thus can end up being narrower than the 527 // "narrow" induction variable. 528 uint64_t NarrowIVWidth = SE->getTypeSizeInBits(WI.NarrowIV->getType()); 529 if (NarrowIVWidth >= Width) 530 return; 531 532 // Cast is either an sext or zext up to this point. 533 // We should not widen an indvar if arithmetics on the wider indvar are more 534 // expensive than those on the narrower indvar. We check only the cost of ADD 535 // because at least an ADD is required to increment the induction variable. We 536 // could compute more comprehensively the cost of all instructions on the 537 // induction variable when necessary. 538 if (TTI && 539 TTI->getArithmeticInstrCost(Instruction::Add, Ty) > 540 TTI->getArithmeticInstrCost(Instruction::Add, 541 Cast->getOperand(0)->getType())) { 542 return; 543 } 544 545 if (!WI.WidestNativeType || 546 Width > SE->getTypeSizeInBits(WI.WidestNativeType)) { 547 WI.WidestNativeType = SE->getEffectiveSCEVType(Ty); 548 WI.IsSigned = IsSigned; 549 return; 550 } 551 552 // We extend the IV to satisfy the sign of its user(s), or 'signed' 553 // if there are multiple users with both sign- and zero extensions, 554 // in order not to introduce nondeterministic behaviour based on the 555 // unspecified order of a PHI nodes' users-iterator. 556 WI.IsSigned |= IsSigned; 557 } 558 559 //===----------------------------------------------------------------------===// 560 // Live IV Reduction - Minimize IVs live across the loop. 561 //===----------------------------------------------------------------------===// 562 563 //===----------------------------------------------------------------------===// 564 // Simplification of IV users based on SCEV evaluation. 565 //===----------------------------------------------------------------------===// 566 567 namespace { 568 569 class IndVarSimplifyVisitor : public IVVisitor { 570 ScalarEvolution *SE; 571 const TargetTransformInfo *TTI; 572 PHINode *IVPhi; 573 574 public: 575 WideIVInfo WI; 576 577 IndVarSimplifyVisitor(PHINode *IV, ScalarEvolution *SCEV, 578 const TargetTransformInfo *TTI, 579 const DominatorTree *DTree) 580 : SE(SCEV), TTI(TTI), IVPhi(IV) { 581 DT = DTree; 582 WI.NarrowIV = IVPhi; 583 } 584 585 // Implement the interface used by simplifyUsersOfIV. 586 void visitCast(CastInst *Cast) override { visitIVCast(Cast, WI, SE, TTI); } 587 }; 588 589 } // end anonymous namespace 590 591 /// Iteratively perform simplification on a worklist of IV users. Each 592 /// successive simplification may push more users which may themselves be 593 /// candidates for simplification. 594 /// 595 /// Sign/Zero extend elimination is interleaved with IV simplification. 596 bool IndVarSimplify::simplifyAndExtend(Loop *L, 597 SCEVExpander &Rewriter, 598 LoopInfo *LI) { 599 SmallVector<WideIVInfo, 8> WideIVs; 600 601 auto *GuardDecl = L->getBlocks()[0]->getModule()->getFunction( 602 Intrinsic::getName(Intrinsic::experimental_guard)); 603 bool HasGuards = GuardDecl && !GuardDecl->use_empty(); 604 605 SmallVector<PHINode *, 8> LoopPhis; 606 for (PHINode &PN : L->getHeader()->phis()) 607 LoopPhis.push_back(&PN); 608 609 // Each round of simplification iterates through the SimplifyIVUsers worklist 610 // for all current phis, then determines whether any IVs can be 611 // widened. Widening adds new phis to LoopPhis, inducing another round of 612 // simplification on the wide IVs. 613 bool Changed = false; 614 while (!LoopPhis.empty()) { 615 // Evaluate as many IV expressions as possible before widening any IVs. This 616 // forces SCEV to set no-wrap flags before evaluating sign/zero 617 // extension. The first time SCEV attempts to normalize sign/zero extension, 618 // the result becomes final. So for the most predictable results, we delay 619 // evaluation of sign/zero extend evaluation until needed, and avoid running 620 // other SCEV based analysis prior to simplifyAndExtend. 621 do { 622 PHINode *CurrIV = LoopPhis.pop_back_val(); 623 624 // Information about sign/zero extensions of CurrIV. 625 IndVarSimplifyVisitor Visitor(CurrIV, SE, TTI, DT); 626 627 const auto &[C, U] = simplifyUsersOfIV(CurrIV, SE, DT, LI, TTI, DeadInsts, 628 Rewriter, &Visitor); 629 630 Changed |= C; 631 RunUnswitching |= U; 632 if (Visitor.WI.WidestNativeType) { 633 WideIVs.push_back(Visitor.WI); 634 } 635 } while(!LoopPhis.empty()); 636 637 // Continue if we disallowed widening. 638 if (!WidenIndVars) 639 continue; 640 641 for (; !WideIVs.empty(); WideIVs.pop_back()) { 642 unsigned ElimExt; 643 unsigned Widened; 644 if (PHINode *WidePhi = createWideIV(WideIVs.back(), LI, SE, Rewriter, 645 DT, DeadInsts, ElimExt, Widened, 646 HasGuards, UsePostIncrementRanges)) { 647 NumElimExt += ElimExt; 648 NumWidened += Widened; 649 Changed = true; 650 LoopPhis.push_back(WidePhi); 651 } 652 } 653 } 654 return Changed; 655 } 656 657 //===----------------------------------------------------------------------===// 658 // linearFunctionTestReplace and its kin. Rewrite the loop exit condition. 659 //===----------------------------------------------------------------------===// 660 661 /// Given an Value which is hoped to be part of an add recurance in the given 662 /// loop, return the associated Phi node if so. Otherwise, return null. Note 663 /// that this is less general than SCEVs AddRec checking. 664 static PHINode *getLoopPhiForCounter(Value *IncV, Loop *L) { 665 Instruction *IncI = dyn_cast<Instruction>(IncV); 666 if (!IncI) 667 return nullptr; 668 669 switch (IncI->getOpcode()) { 670 case Instruction::Add: 671 case Instruction::Sub: 672 break; 673 case Instruction::GetElementPtr: 674 // An IV counter must preserve its type. 675 if (IncI->getNumOperands() == 2) 676 break; 677 [[fallthrough]]; 678 default: 679 return nullptr; 680 } 681 682 PHINode *Phi = dyn_cast<PHINode>(IncI->getOperand(0)); 683 if (Phi && Phi->getParent() == L->getHeader()) { 684 if (L->isLoopInvariant(IncI->getOperand(1))) 685 return Phi; 686 return nullptr; 687 } 688 if (IncI->getOpcode() == Instruction::GetElementPtr) 689 return nullptr; 690 691 // Allow add/sub to be commuted. 692 Phi = dyn_cast<PHINode>(IncI->getOperand(1)); 693 if (Phi && Phi->getParent() == L->getHeader()) { 694 if (L->isLoopInvariant(IncI->getOperand(0))) 695 return Phi; 696 } 697 return nullptr; 698 } 699 700 /// Whether the current loop exit test is based on this value. Currently this 701 /// is limited to a direct use in the loop condition. 702 static bool isLoopExitTestBasedOn(Value *V, BasicBlock *ExitingBB) { 703 BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator()); 704 ICmpInst *ICmp = dyn_cast<ICmpInst>(BI->getCondition()); 705 // TODO: Allow non-icmp loop test. 706 if (!ICmp) 707 return false; 708 709 // TODO: Allow indirect use. 710 return ICmp->getOperand(0) == V || ICmp->getOperand(1) == V; 711 } 712 713 /// linearFunctionTestReplace policy. Return true unless we can show that the 714 /// current exit test is already sufficiently canonical. 715 static bool needsLFTR(Loop *L, BasicBlock *ExitingBB) { 716 assert(L->getLoopLatch() && "Must be in simplified form"); 717 718 // Avoid converting a constant or loop invariant test back to a runtime 719 // test. This is critical for when SCEV's cached ExitCount is less precise 720 // than the current IR (such as after we've proven a particular exit is 721 // actually dead and thus the BE count never reaches our ExitCount.) 722 BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator()); 723 if (L->isLoopInvariant(BI->getCondition())) 724 return false; 725 726 // Do LFTR to simplify the exit condition to an ICMP. 727 ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition()); 728 if (!Cond) 729 return true; 730 731 // Do LFTR to simplify the exit ICMP to EQ/NE 732 ICmpInst::Predicate Pred = Cond->getPredicate(); 733 if (Pred != ICmpInst::ICMP_NE && Pred != ICmpInst::ICMP_EQ) 734 return true; 735 736 // Look for a loop invariant RHS 737 Value *LHS = Cond->getOperand(0); 738 Value *RHS = Cond->getOperand(1); 739 if (!L->isLoopInvariant(RHS)) { 740 if (!L->isLoopInvariant(LHS)) 741 return true; 742 std::swap(LHS, RHS); 743 } 744 // Look for a simple IV counter LHS 745 PHINode *Phi = dyn_cast<PHINode>(LHS); 746 if (!Phi) 747 Phi = getLoopPhiForCounter(LHS, L); 748 749 if (!Phi) 750 return true; 751 752 // Do LFTR if PHI node is defined in the loop, but is *not* a counter. 753 int Idx = Phi->getBasicBlockIndex(L->getLoopLatch()); 754 if (Idx < 0) 755 return true; 756 757 // Do LFTR if the exit condition's IV is *not* a simple counter. 758 Value *IncV = Phi->getIncomingValue(Idx); 759 return Phi != getLoopPhiForCounter(IncV, L); 760 } 761 762 /// Recursive helper for hasConcreteDef(). Unfortunately, this currently boils 763 /// down to checking that all operands are constant and listing instructions 764 /// that may hide undef. 765 static bool hasConcreteDefImpl(Value *V, SmallPtrSetImpl<Value*> &Visited, 766 unsigned Depth) { 767 if (isa<Constant>(V)) 768 return !isa<UndefValue>(V); 769 770 if (Depth >= 6) 771 return false; 772 773 // Conservatively handle non-constant non-instructions. For example, Arguments 774 // may be undef. 775 Instruction *I = dyn_cast<Instruction>(V); 776 if (!I) 777 return false; 778 779 // Load and return values may be undef. 780 if(I->mayReadFromMemory() || isa<CallInst>(I) || isa<InvokeInst>(I)) 781 return false; 782 783 // Optimistically handle other instructions. 784 for (Value *Op : I->operands()) { 785 if (!Visited.insert(Op).second) 786 continue; 787 if (!hasConcreteDefImpl(Op, Visited, Depth+1)) 788 return false; 789 } 790 return true; 791 } 792 793 /// Return true if the given value is concrete. We must prove that undef can 794 /// never reach it. 795 /// 796 /// TODO: If we decide that this is a good approach to checking for undef, we 797 /// may factor it into a common location. 798 static bool hasConcreteDef(Value *V) { 799 SmallPtrSet<Value*, 8> Visited; 800 Visited.insert(V); 801 return hasConcreteDefImpl(V, Visited, 0); 802 } 803 804 /// Return true if the given phi is a "counter" in L. A counter is an 805 /// add recurance (of integer or pointer type) with an arbitrary start, and a 806 /// step of 1. Note that L must have exactly one latch. 807 static bool isLoopCounter(PHINode* Phi, Loop *L, 808 ScalarEvolution *SE) { 809 assert(Phi->getParent() == L->getHeader()); 810 assert(L->getLoopLatch()); 811 812 if (!SE->isSCEVable(Phi->getType())) 813 return false; 814 815 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Phi)); 816 if (!AR || AR->getLoop() != L || !AR->isAffine()) 817 return false; 818 819 const SCEV *Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE)); 820 if (!Step || !Step->isOne()) 821 return false; 822 823 int LatchIdx = Phi->getBasicBlockIndex(L->getLoopLatch()); 824 Value *IncV = Phi->getIncomingValue(LatchIdx); 825 return (getLoopPhiForCounter(IncV, L) == Phi && 826 isa<SCEVAddRecExpr>(SE->getSCEV(IncV))); 827 } 828 829 /// Search the loop header for a loop counter (anadd rec w/step of one) 830 /// suitable for use by LFTR. If multiple counters are available, select the 831 /// "best" one based profitable heuristics. 832 /// 833 /// BECount may be an i8* pointer type. The pointer difference is already 834 /// valid count without scaling the address stride, so it remains a pointer 835 /// expression as far as SCEV is concerned. 836 static PHINode *FindLoopCounter(Loop *L, BasicBlock *ExitingBB, 837 const SCEV *BECount, 838 ScalarEvolution *SE, DominatorTree *DT) { 839 uint64_t BCWidth = SE->getTypeSizeInBits(BECount->getType()); 840 841 Value *Cond = cast<BranchInst>(ExitingBB->getTerminator())->getCondition(); 842 843 // Loop over all of the PHI nodes, looking for a simple counter. 844 PHINode *BestPhi = nullptr; 845 const SCEV *BestInit = nullptr; 846 BasicBlock *LatchBlock = L->getLoopLatch(); 847 assert(LatchBlock && "Must be in simplified form"); 848 const DataLayout &DL = L->getHeader()->getDataLayout(); 849 850 for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) { 851 PHINode *Phi = cast<PHINode>(I); 852 if (!isLoopCounter(Phi, L, SE)) 853 continue; 854 855 const auto *AR = cast<SCEVAddRecExpr>(SE->getSCEV(Phi)); 856 857 // AR may be a pointer type, while BECount is an integer type. 858 // AR may be wider than BECount. With eq/ne tests overflow is immaterial. 859 // AR may not be a narrower type, or we may never exit. 860 uint64_t PhiWidth = SE->getTypeSizeInBits(AR->getType()); 861 if (PhiWidth < BCWidth || !DL.isLegalInteger(PhiWidth)) 862 continue; 863 864 // Avoid reusing a potentially undef value to compute other values that may 865 // have originally had a concrete definition. 866 if (!hasConcreteDef(Phi)) { 867 // We explicitly allow unknown phis as long as they are already used by 868 // the loop exit test. This is legal since performing LFTR could not 869 // increase the number of undef users. 870 Value *IncPhi = Phi->getIncomingValueForBlock(LatchBlock); 871 if (!isLoopExitTestBasedOn(Phi, ExitingBB) && 872 !isLoopExitTestBasedOn(IncPhi, ExitingBB)) 873 continue; 874 } 875 876 // Avoid introducing undefined behavior due to poison which didn't exist in 877 // the original program. (Annoyingly, the rules for poison and undef 878 // propagation are distinct, so this does NOT cover the undef case above.) 879 // We have to ensure that we don't introduce UB by introducing a use on an 880 // iteration where said IV produces poison. Our strategy here differs for 881 // pointers and integer IVs. For integers, we strip and reinfer as needed, 882 // see code in linearFunctionTestReplace. For pointers, we restrict 883 // transforms as there is no good way to reinfer inbounds once lost. 884 if (!Phi->getType()->isIntegerTy() && 885 !mustExecuteUBIfPoisonOnPathTo(Phi, ExitingBB->getTerminator(), DT)) 886 continue; 887 888 const SCEV *Init = AR->getStart(); 889 890 if (BestPhi && !isAlmostDeadIV(BestPhi, LatchBlock, Cond)) { 891 // Don't force a live loop counter if another IV can be used. 892 if (isAlmostDeadIV(Phi, LatchBlock, Cond)) 893 continue; 894 895 // Prefer to count-from-zero. This is a more "canonical" counter form. It 896 // also prefers integer to pointer IVs. 897 if (BestInit->isZero() != Init->isZero()) { 898 if (BestInit->isZero()) 899 continue; 900 } 901 // If two IVs both count from zero or both count from nonzero then the 902 // narrower is likely a dead phi that has been widened. Use the wider phi 903 // to allow the other to be eliminated. 904 else if (PhiWidth <= SE->getTypeSizeInBits(BestPhi->getType())) 905 continue; 906 } 907 BestPhi = Phi; 908 BestInit = Init; 909 } 910 return BestPhi; 911 } 912 913 /// Insert an IR expression which computes the value held by the IV IndVar 914 /// (which must be an loop counter w/unit stride) after the backedge of loop L 915 /// is taken ExitCount times. 916 static Value *genLoopLimit(PHINode *IndVar, BasicBlock *ExitingBB, 917 const SCEV *ExitCount, bool UsePostInc, Loop *L, 918 SCEVExpander &Rewriter, ScalarEvolution *SE) { 919 assert(isLoopCounter(IndVar, L, SE)); 920 assert(ExitCount->getType()->isIntegerTy() && "exit count must be integer"); 921 const SCEVAddRecExpr *AR = cast<SCEVAddRecExpr>(SE->getSCEV(IndVar)); 922 assert(AR->getStepRecurrence(*SE)->isOne() && "only handles unit stride"); 923 924 // For integer IVs, truncate the IV before computing the limit unless we 925 // know apriori that the limit must be a constant when evaluated in the 926 // bitwidth of the IV. We prefer (potentially) keeping a truncate of the 927 // IV in the loop over a (potentially) expensive expansion of the widened 928 // exit count add(zext(add)) expression. 929 if (IndVar->getType()->isIntegerTy() && 930 SE->getTypeSizeInBits(AR->getType()) > 931 SE->getTypeSizeInBits(ExitCount->getType())) { 932 const SCEV *IVInit = AR->getStart(); 933 if (!isa<SCEVConstant>(IVInit) || !isa<SCEVConstant>(ExitCount)) 934 AR = cast<SCEVAddRecExpr>(SE->getTruncateExpr(AR, ExitCount->getType())); 935 } 936 937 const SCEVAddRecExpr *ARBase = UsePostInc ? AR->getPostIncExpr(*SE) : AR; 938 const SCEV *IVLimit = ARBase->evaluateAtIteration(ExitCount, *SE); 939 assert(SE->isLoopInvariant(IVLimit, L) && 940 "Computed iteration count is not loop invariant!"); 941 return Rewriter.expandCodeFor(IVLimit, ARBase->getType(), 942 ExitingBB->getTerminator()); 943 } 944 945 /// This method rewrites the exit condition of the loop to be a canonical != 946 /// comparison against the incremented loop induction variable. This pass is 947 /// able to rewrite the exit tests of any loop where the SCEV analysis can 948 /// determine a loop-invariant trip count of the loop, which is actually a much 949 /// broader range than just linear tests. 950 bool IndVarSimplify:: 951 linearFunctionTestReplace(Loop *L, BasicBlock *ExitingBB, 952 const SCEV *ExitCount, 953 PHINode *IndVar, SCEVExpander &Rewriter) { 954 assert(L->getLoopLatch() && "Loop no longer in simplified form?"); 955 assert(isLoopCounter(IndVar, L, SE)); 956 Instruction * const IncVar = 957 cast<Instruction>(IndVar->getIncomingValueForBlock(L->getLoopLatch())); 958 959 // Initialize CmpIndVar to the preincremented IV. 960 Value *CmpIndVar = IndVar; 961 bool UsePostInc = false; 962 963 // If the exiting block is the same as the backedge block, we prefer to 964 // compare against the post-incremented value, otherwise we must compare 965 // against the preincremented value. 966 if (ExitingBB == L->getLoopLatch()) { 967 // For pointer IVs, we chose to not strip inbounds which requires us not 968 // to add a potentially UB introducing use. We need to either a) show 969 // the loop test we're modifying is already in post-inc form, or b) show 970 // that adding a use must not introduce UB. 971 bool SafeToPostInc = 972 IndVar->getType()->isIntegerTy() || 973 isLoopExitTestBasedOn(IncVar, ExitingBB) || 974 mustExecuteUBIfPoisonOnPathTo(IncVar, ExitingBB->getTerminator(), DT); 975 if (SafeToPostInc) { 976 UsePostInc = true; 977 CmpIndVar = IncVar; 978 } 979 } 980 981 // It may be necessary to drop nowrap flags on the incrementing instruction 982 // if either LFTR moves from a pre-inc check to a post-inc check (in which 983 // case the increment might have previously been poison on the last iteration 984 // only) or if LFTR switches to a different IV that was previously dynamically 985 // dead (and as such may be arbitrarily poison). We remove any nowrap flags 986 // that SCEV didn't infer for the post-inc addrec (even if we use a pre-inc 987 // check), because the pre-inc addrec flags may be adopted from the original 988 // instruction, while SCEV has to explicitly prove the post-inc nowrap flags. 989 // TODO: This handling is inaccurate for one case: If we switch to a 990 // dynamically dead IV that wraps on the first loop iteration only, which is 991 // not covered by the post-inc addrec. (If the new IV was not dynamically 992 // dead, it could not be poison on the first iteration in the first place.) 993 if (auto *BO = dyn_cast<BinaryOperator>(IncVar)) { 994 const SCEVAddRecExpr *AR = cast<SCEVAddRecExpr>(SE->getSCEV(IncVar)); 995 if (BO->hasNoUnsignedWrap()) 996 BO->setHasNoUnsignedWrap(AR->hasNoUnsignedWrap()); 997 if (BO->hasNoSignedWrap()) 998 BO->setHasNoSignedWrap(AR->hasNoSignedWrap()); 999 } 1000 1001 Value *ExitCnt = genLoopLimit( 1002 IndVar, ExitingBB, ExitCount, UsePostInc, L, Rewriter, SE); 1003 assert(ExitCnt->getType()->isPointerTy() == 1004 IndVar->getType()->isPointerTy() && 1005 "genLoopLimit missed a cast"); 1006 1007 // Insert a new icmp_ne or icmp_eq instruction before the branch. 1008 BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator()); 1009 ICmpInst::Predicate P; 1010 if (L->contains(BI->getSuccessor(0))) 1011 P = ICmpInst::ICMP_NE; 1012 else 1013 P = ICmpInst::ICMP_EQ; 1014 1015 IRBuilder<> Builder(BI); 1016 1017 // The new loop exit condition should reuse the debug location of the 1018 // original loop exit condition. 1019 if (auto *Cond = dyn_cast<Instruction>(BI->getCondition())) 1020 Builder.SetCurrentDebugLocation(Cond->getDebugLoc()); 1021 1022 // For integer IVs, if we evaluated the limit in the narrower bitwidth to 1023 // avoid the expensive expansion of the limit expression in the wider type, 1024 // emit a truncate to narrow the IV to the ExitCount type. This is safe 1025 // since we know (from the exit count bitwidth), that we can't self-wrap in 1026 // the narrower type. 1027 unsigned CmpIndVarSize = SE->getTypeSizeInBits(CmpIndVar->getType()); 1028 unsigned ExitCntSize = SE->getTypeSizeInBits(ExitCnt->getType()); 1029 if (CmpIndVarSize > ExitCntSize) { 1030 assert(!CmpIndVar->getType()->isPointerTy() && 1031 !ExitCnt->getType()->isPointerTy()); 1032 1033 // Before resorting to actually inserting the truncate, use the same 1034 // reasoning as from SimplifyIndvar::eliminateTrunc to see if we can extend 1035 // the other side of the comparison instead. We still evaluate the limit 1036 // in the narrower bitwidth, we just prefer a zext/sext outside the loop to 1037 // a truncate within in. 1038 bool Extended = false; 1039 const SCEV *IV = SE->getSCEV(CmpIndVar); 1040 const SCEV *TruncatedIV = SE->getTruncateExpr(IV, ExitCnt->getType()); 1041 const SCEV *ZExtTrunc = 1042 SE->getZeroExtendExpr(TruncatedIV, CmpIndVar->getType()); 1043 1044 if (ZExtTrunc == IV) { 1045 Extended = true; 1046 ExitCnt = Builder.CreateZExt(ExitCnt, IndVar->getType(), 1047 "wide.trip.count"); 1048 } else { 1049 const SCEV *SExtTrunc = 1050 SE->getSignExtendExpr(TruncatedIV, CmpIndVar->getType()); 1051 if (SExtTrunc == IV) { 1052 Extended = true; 1053 ExitCnt = Builder.CreateSExt(ExitCnt, IndVar->getType(), 1054 "wide.trip.count"); 1055 } 1056 } 1057 1058 if (Extended) { 1059 bool Discard; 1060 L->makeLoopInvariant(ExitCnt, Discard); 1061 } else 1062 CmpIndVar = Builder.CreateTrunc(CmpIndVar, ExitCnt->getType(), 1063 "lftr.wideiv"); 1064 } 1065 LLVM_DEBUG(dbgs() << "INDVARS: Rewriting loop exit condition to:\n" 1066 << " LHS:" << *CmpIndVar << '\n' 1067 << " op:\t" << (P == ICmpInst::ICMP_NE ? "!=" : "==") 1068 << "\n" 1069 << " RHS:\t" << *ExitCnt << "\n" 1070 << "ExitCount:\t" << *ExitCount << "\n" 1071 << " was: " << *BI->getCondition() << "\n"); 1072 1073 Value *Cond = Builder.CreateICmp(P, CmpIndVar, ExitCnt, "exitcond"); 1074 Value *OrigCond = BI->getCondition(); 1075 // It's tempting to use replaceAllUsesWith here to fully replace the old 1076 // comparison, but that's not immediately safe, since users of the old 1077 // comparison may not be dominated by the new comparison. Instead, just 1078 // update the branch to use the new comparison; in the common case this 1079 // will make old comparison dead. 1080 BI->setCondition(Cond); 1081 DeadInsts.emplace_back(OrigCond); 1082 1083 ++NumLFTR; 1084 return true; 1085 } 1086 1087 //===----------------------------------------------------------------------===// 1088 // sinkUnusedInvariants. A late subpass to cleanup loop preheaders. 1089 //===----------------------------------------------------------------------===// 1090 1091 /// If there's a single exit block, sink any loop-invariant values that 1092 /// were defined in the preheader but not used inside the loop into the 1093 /// exit block to reduce register pressure in the loop. 1094 bool IndVarSimplify::sinkUnusedInvariants(Loop *L) { 1095 BasicBlock *ExitBlock = L->getExitBlock(); 1096 if (!ExitBlock) return false; 1097 1098 BasicBlock *Preheader = L->getLoopPreheader(); 1099 if (!Preheader) return false; 1100 1101 bool MadeAnyChanges = false; 1102 BasicBlock::iterator InsertPt = ExitBlock->getFirstInsertionPt(); 1103 BasicBlock::iterator I(Preheader->getTerminator()); 1104 while (I != Preheader->begin()) { 1105 --I; 1106 // New instructions were inserted at the end of the preheader. 1107 if (isa<PHINode>(I)) 1108 break; 1109 1110 // Don't move instructions which might have side effects, since the side 1111 // effects need to complete before instructions inside the loop. Also don't 1112 // move instructions which might read memory, since the loop may modify 1113 // memory. Note that it's okay if the instruction might have undefined 1114 // behavior: LoopSimplify guarantees that the preheader dominates the exit 1115 // block. 1116 if (I->mayHaveSideEffects() || I->mayReadFromMemory()) 1117 continue; 1118 1119 // Skip debug info intrinsics. 1120 if (isa<DbgInfoIntrinsic>(I)) 1121 continue; 1122 1123 // Skip eh pad instructions. 1124 if (I->isEHPad()) 1125 continue; 1126 1127 // Don't sink alloca: we never want to sink static alloca's out of the 1128 // entry block, and correctly sinking dynamic alloca's requires 1129 // checks for stacksave/stackrestore intrinsics. 1130 // FIXME: Refactor this check somehow? 1131 if (isa<AllocaInst>(I)) 1132 continue; 1133 1134 // Determine if there is a use in or before the loop (direct or 1135 // otherwise). 1136 bool UsedInLoop = false; 1137 for (Use &U : I->uses()) { 1138 Instruction *User = cast<Instruction>(U.getUser()); 1139 BasicBlock *UseBB = User->getParent(); 1140 if (PHINode *P = dyn_cast<PHINode>(User)) { 1141 unsigned i = 1142 PHINode::getIncomingValueNumForOperand(U.getOperandNo()); 1143 UseBB = P->getIncomingBlock(i); 1144 } 1145 if (UseBB == Preheader || L->contains(UseBB)) { 1146 UsedInLoop = true; 1147 break; 1148 } 1149 } 1150 1151 // If there is, the def must remain in the preheader. 1152 if (UsedInLoop) 1153 continue; 1154 1155 // Otherwise, sink it to the exit block. 1156 Instruction *ToMove = &*I; 1157 bool Done = false; 1158 1159 if (I != Preheader->begin()) { 1160 // Skip debug info intrinsics. 1161 do { 1162 --I; 1163 } while (I->isDebugOrPseudoInst() && I != Preheader->begin()); 1164 1165 if (I->isDebugOrPseudoInst() && I == Preheader->begin()) 1166 Done = true; 1167 } else { 1168 Done = true; 1169 } 1170 1171 MadeAnyChanges = true; 1172 ToMove->moveBefore(*ExitBlock, InsertPt); 1173 SE->forgetValue(ToMove); 1174 if (Done) break; 1175 InsertPt = ToMove->getIterator(); 1176 } 1177 1178 return MadeAnyChanges; 1179 } 1180 1181 static void replaceExitCond(BranchInst *BI, Value *NewCond, 1182 SmallVectorImpl<WeakTrackingVH> &DeadInsts) { 1183 auto *OldCond = BI->getCondition(); 1184 LLVM_DEBUG(dbgs() << "Replacing condition of loop-exiting branch " << *BI 1185 << " with " << *NewCond << "\n"); 1186 BI->setCondition(NewCond); 1187 if (OldCond->use_empty()) 1188 DeadInsts.emplace_back(OldCond); 1189 } 1190 1191 static Constant *createFoldedExitCond(const Loop *L, BasicBlock *ExitingBB, 1192 bool IsTaken) { 1193 BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator()); 1194 bool ExitIfTrue = !L->contains(*succ_begin(ExitingBB)); 1195 auto *OldCond = BI->getCondition(); 1196 return ConstantInt::get(OldCond->getType(), 1197 IsTaken ? ExitIfTrue : !ExitIfTrue); 1198 } 1199 1200 static void foldExit(const Loop *L, BasicBlock *ExitingBB, bool IsTaken, 1201 SmallVectorImpl<WeakTrackingVH> &DeadInsts) { 1202 BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator()); 1203 auto *NewCond = createFoldedExitCond(L, ExitingBB, IsTaken); 1204 replaceExitCond(BI, NewCond, DeadInsts); 1205 } 1206 1207 static void replaceLoopPHINodesWithPreheaderValues( 1208 LoopInfo *LI, Loop *L, SmallVectorImpl<WeakTrackingVH> &DeadInsts, 1209 ScalarEvolution &SE) { 1210 assert(L->isLoopSimplifyForm() && "Should only do it in simplify form!"); 1211 auto *LoopPreheader = L->getLoopPreheader(); 1212 auto *LoopHeader = L->getHeader(); 1213 SmallVector<Instruction *> Worklist; 1214 for (auto &PN : LoopHeader->phis()) { 1215 auto *PreheaderIncoming = PN.getIncomingValueForBlock(LoopPreheader); 1216 for (User *U : PN.users()) 1217 Worklist.push_back(cast<Instruction>(U)); 1218 SE.forgetValue(&PN); 1219 PN.replaceAllUsesWith(PreheaderIncoming); 1220 DeadInsts.emplace_back(&PN); 1221 } 1222 1223 // Replacing with the preheader value will often allow IV users to simplify 1224 // (especially if the preheader value is a constant). 1225 SmallPtrSet<Instruction *, 16> Visited; 1226 while (!Worklist.empty()) { 1227 auto *I = cast<Instruction>(Worklist.pop_back_val()); 1228 if (!Visited.insert(I).second) 1229 continue; 1230 1231 // Don't simplify instructions outside the loop. 1232 if (!L->contains(I)) 1233 continue; 1234 1235 Value *Res = simplifyInstruction(I, I->getDataLayout()); 1236 if (Res && LI->replacementPreservesLCSSAForm(I, Res)) { 1237 for (User *U : I->users()) 1238 Worklist.push_back(cast<Instruction>(U)); 1239 I->replaceAllUsesWith(Res); 1240 DeadInsts.emplace_back(I); 1241 } 1242 } 1243 } 1244 1245 static Value * 1246 createInvariantCond(const Loop *L, BasicBlock *ExitingBB, 1247 const ScalarEvolution::LoopInvariantPredicate &LIP, 1248 SCEVExpander &Rewriter) { 1249 ICmpInst::Predicate InvariantPred = LIP.Pred; 1250 BasicBlock *Preheader = L->getLoopPreheader(); 1251 assert(Preheader && "Preheader doesn't exist"); 1252 Rewriter.setInsertPoint(Preheader->getTerminator()); 1253 auto *LHSV = Rewriter.expandCodeFor(LIP.LHS); 1254 auto *RHSV = Rewriter.expandCodeFor(LIP.RHS); 1255 bool ExitIfTrue = !L->contains(*succ_begin(ExitingBB)); 1256 if (ExitIfTrue) 1257 InvariantPred = ICmpInst::getInversePredicate(InvariantPred); 1258 IRBuilder<> Builder(Preheader->getTerminator()); 1259 BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator()); 1260 return Builder.CreateICmp(InvariantPred, LHSV, RHSV, 1261 BI->getCondition()->getName()); 1262 } 1263 1264 static std::optional<Value *> 1265 createReplacement(ICmpInst *ICmp, const Loop *L, BasicBlock *ExitingBB, 1266 const SCEV *MaxIter, bool Inverted, bool SkipLastIter, 1267 ScalarEvolution *SE, SCEVExpander &Rewriter) { 1268 ICmpInst::Predicate Pred = ICmp->getPredicate(); 1269 Value *LHS = ICmp->getOperand(0); 1270 Value *RHS = ICmp->getOperand(1); 1271 1272 // 'LHS pred RHS' should now mean that we stay in loop. 1273 auto *BI = cast<BranchInst>(ExitingBB->getTerminator()); 1274 if (Inverted) 1275 Pred = CmpInst::getInversePredicate(Pred); 1276 1277 const SCEV *LHSS = SE->getSCEVAtScope(LHS, L); 1278 const SCEV *RHSS = SE->getSCEVAtScope(RHS, L); 1279 // Can we prove it to be trivially true or false? 1280 if (auto EV = SE->evaluatePredicateAt(Pred, LHSS, RHSS, BI)) 1281 return createFoldedExitCond(L, ExitingBB, /*IsTaken*/ !*EV); 1282 1283 auto *ARTy = LHSS->getType(); 1284 auto *MaxIterTy = MaxIter->getType(); 1285 // If possible, adjust types. 1286 if (SE->getTypeSizeInBits(ARTy) > SE->getTypeSizeInBits(MaxIterTy)) 1287 MaxIter = SE->getZeroExtendExpr(MaxIter, ARTy); 1288 else if (SE->getTypeSizeInBits(ARTy) < SE->getTypeSizeInBits(MaxIterTy)) { 1289 const SCEV *MinusOne = SE->getMinusOne(ARTy); 1290 auto *MaxAllowedIter = SE->getZeroExtendExpr(MinusOne, MaxIterTy); 1291 if (SE->isKnownPredicateAt(ICmpInst::ICMP_ULE, MaxIter, MaxAllowedIter, BI)) 1292 MaxIter = SE->getTruncateExpr(MaxIter, ARTy); 1293 } 1294 1295 if (SkipLastIter) { 1296 // Semantically skip last iter is "subtract 1, do not bother about unsigned 1297 // wrap". getLoopInvariantExitCondDuringFirstIterations knows how to deal 1298 // with umin in a smart way, but umin(a, b) - 1 will likely not simplify. 1299 // So we manually construct umin(a - 1, b - 1). 1300 SmallVector<const SCEV *, 4> Elements; 1301 if (auto *UMin = dyn_cast<SCEVUMinExpr>(MaxIter)) { 1302 for (auto *Op : UMin->operands()) 1303 Elements.push_back(SE->getMinusSCEV(Op, SE->getOne(Op->getType()))); 1304 MaxIter = SE->getUMinFromMismatchedTypes(Elements); 1305 } else 1306 MaxIter = SE->getMinusSCEV(MaxIter, SE->getOne(MaxIter->getType())); 1307 } 1308 1309 // Check if there is a loop-invariant predicate equivalent to our check. 1310 auto LIP = SE->getLoopInvariantExitCondDuringFirstIterations(Pred, LHSS, RHSS, 1311 L, BI, MaxIter); 1312 if (!LIP) 1313 return std::nullopt; 1314 1315 // Can we prove it to be trivially true? 1316 if (SE->isKnownPredicateAt(LIP->Pred, LIP->LHS, LIP->RHS, BI)) 1317 return createFoldedExitCond(L, ExitingBB, /*IsTaken*/ false); 1318 else 1319 return createInvariantCond(L, ExitingBB, *LIP, Rewriter); 1320 } 1321 1322 static bool optimizeLoopExitWithUnknownExitCount( 1323 const Loop *L, BranchInst *BI, BasicBlock *ExitingBB, const SCEV *MaxIter, 1324 bool SkipLastIter, ScalarEvolution *SE, SCEVExpander &Rewriter, 1325 SmallVectorImpl<WeakTrackingVH> &DeadInsts) { 1326 assert( 1327 (L->contains(BI->getSuccessor(0)) != L->contains(BI->getSuccessor(1))) && 1328 "Not a loop exit!"); 1329 1330 // For branch that stays in loop by TRUE condition, go through AND. For branch 1331 // that stays in loop by FALSE condition, go through OR. Both gives the 1332 // similar logic: "stay in loop iff all conditions are true(false)". 1333 bool Inverted = L->contains(BI->getSuccessor(1)); 1334 SmallVector<ICmpInst *, 4> LeafConditions; 1335 SmallVector<Value *, 4> Worklist; 1336 SmallPtrSet<Value *, 4> Visited; 1337 Value *OldCond = BI->getCondition(); 1338 Visited.insert(OldCond); 1339 Worklist.push_back(OldCond); 1340 1341 auto GoThrough = [&](Value *V) { 1342 Value *LHS = nullptr, *RHS = nullptr; 1343 if (Inverted) { 1344 if (!match(V, m_LogicalOr(m_Value(LHS), m_Value(RHS)))) 1345 return false; 1346 } else { 1347 if (!match(V, m_LogicalAnd(m_Value(LHS), m_Value(RHS)))) 1348 return false; 1349 } 1350 if (Visited.insert(LHS).second) 1351 Worklist.push_back(LHS); 1352 if (Visited.insert(RHS).second) 1353 Worklist.push_back(RHS); 1354 return true; 1355 }; 1356 1357 do { 1358 Value *Curr = Worklist.pop_back_val(); 1359 // Go through AND/OR conditions. Collect leaf ICMPs. We only care about 1360 // those with one use, to avoid instruction duplication. 1361 if (Curr->hasOneUse()) 1362 if (!GoThrough(Curr)) 1363 if (auto *ICmp = dyn_cast<ICmpInst>(Curr)) 1364 LeafConditions.push_back(ICmp); 1365 } while (!Worklist.empty()); 1366 1367 // If the current basic block has the same exit count as the whole loop, and 1368 // it consists of multiple icmp's, try to collect all icmp's that give exact 1369 // same exit count. For all other icmp's, we could use one less iteration, 1370 // because their value on the last iteration doesn't really matter. 1371 SmallPtrSet<ICmpInst *, 4> ICmpsFailingOnLastIter; 1372 if (!SkipLastIter && LeafConditions.size() > 1 && 1373 SE->getExitCount(L, ExitingBB, 1374 ScalarEvolution::ExitCountKind::SymbolicMaximum) == 1375 MaxIter) 1376 for (auto *ICmp : LeafConditions) { 1377 auto EL = SE->computeExitLimitFromCond(L, ICmp, Inverted, 1378 /*ControlsExit*/ false); 1379 auto *ExitMax = EL.SymbolicMaxNotTaken; 1380 if (isa<SCEVCouldNotCompute>(ExitMax)) 1381 continue; 1382 // They could be of different types (specifically this happens after 1383 // IV widening). 1384 auto *WiderType = 1385 SE->getWiderType(ExitMax->getType(), MaxIter->getType()); 1386 auto *WideExitMax = SE->getNoopOrZeroExtend(ExitMax, WiderType); 1387 auto *WideMaxIter = SE->getNoopOrZeroExtend(MaxIter, WiderType); 1388 if (WideExitMax == WideMaxIter) 1389 ICmpsFailingOnLastIter.insert(ICmp); 1390 } 1391 1392 bool Changed = false; 1393 for (auto *OldCond : LeafConditions) { 1394 // Skip last iteration for this icmp under one of two conditions: 1395 // - We do it for all conditions; 1396 // - There is another ICmp that would fail on last iter, so this one doesn't 1397 // really matter. 1398 bool OptimisticSkipLastIter = SkipLastIter; 1399 if (!OptimisticSkipLastIter) { 1400 if (ICmpsFailingOnLastIter.size() > 1) 1401 OptimisticSkipLastIter = true; 1402 else if (ICmpsFailingOnLastIter.size() == 1) 1403 OptimisticSkipLastIter = !ICmpsFailingOnLastIter.count(OldCond); 1404 } 1405 if (auto Replaced = 1406 createReplacement(OldCond, L, ExitingBB, MaxIter, Inverted, 1407 OptimisticSkipLastIter, SE, Rewriter)) { 1408 Changed = true; 1409 auto *NewCond = *Replaced; 1410 if (auto *NCI = dyn_cast<Instruction>(NewCond)) { 1411 NCI->setName(OldCond->getName() + ".first_iter"); 1412 } 1413 LLVM_DEBUG(dbgs() << "Unknown exit count: Replacing " << *OldCond 1414 << " with " << *NewCond << "\n"); 1415 assert(OldCond->hasOneUse() && "Must be!"); 1416 OldCond->replaceAllUsesWith(NewCond); 1417 DeadInsts.push_back(OldCond); 1418 // Make sure we no longer consider this condition as failing on last 1419 // iteration. 1420 ICmpsFailingOnLastIter.erase(OldCond); 1421 } 1422 } 1423 return Changed; 1424 } 1425 1426 bool IndVarSimplify::canonicalizeExitCondition(Loop *L) { 1427 // Note: This is duplicating a particular part on SimplifyIndVars reasoning. 1428 // We need to duplicate it because given icmp zext(small-iv), C, IVUsers 1429 // never reaches the icmp since the zext doesn't fold to an AddRec unless 1430 // it already has flags. The alternative to this would be to extending the 1431 // set of "interesting" IV users to include the icmp, but doing that 1432 // regresses results in practice by querying SCEVs before trip counts which 1433 // rely on them which results in SCEV caching sub-optimal answers. The 1434 // concern about caching sub-optimal results is why we only query SCEVs of 1435 // the loop invariant RHS here. 1436 SmallVector<BasicBlock*, 16> ExitingBlocks; 1437 L->getExitingBlocks(ExitingBlocks); 1438 bool Changed = false; 1439 for (auto *ExitingBB : ExitingBlocks) { 1440 auto *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator()); 1441 if (!BI) 1442 continue; 1443 assert(BI->isConditional() && "exit branch must be conditional"); 1444 1445 auto *ICmp = dyn_cast<ICmpInst>(BI->getCondition()); 1446 if (!ICmp || !ICmp->hasOneUse()) 1447 continue; 1448 1449 auto *LHS = ICmp->getOperand(0); 1450 auto *RHS = ICmp->getOperand(1); 1451 // For the range reasoning, avoid computing SCEVs in the loop to avoid 1452 // poisoning cache with sub-optimal results. For the must-execute case, 1453 // this is a neccessary precondition for correctness. 1454 if (!L->isLoopInvariant(RHS)) { 1455 if (!L->isLoopInvariant(LHS)) 1456 continue; 1457 // Same logic applies for the inverse case 1458 std::swap(LHS, RHS); 1459 } 1460 1461 // Match (icmp signed-cond zext, RHS) 1462 Value *LHSOp = nullptr; 1463 if (!match(LHS, m_ZExt(m_Value(LHSOp))) || !ICmp->isSigned()) 1464 continue; 1465 1466 const DataLayout &DL = ExitingBB->getDataLayout(); 1467 const unsigned InnerBitWidth = DL.getTypeSizeInBits(LHSOp->getType()); 1468 const unsigned OuterBitWidth = DL.getTypeSizeInBits(RHS->getType()); 1469 auto FullCR = ConstantRange::getFull(InnerBitWidth); 1470 FullCR = FullCR.zeroExtend(OuterBitWidth); 1471 auto RHSCR = SE->getUnsignedRange(SE->applyLoopGuards(SE->getSCEV(RHS), L)); 1472 if (FullCR.contains(RHSCR)) { 1473 // We have now matched icmp signed-cond zext(X), zext(Y'), and can thus 1474 // replace the signed condition with the unsigned version. 1475 ICmp->setPredicate(ICmp->getUnsignedPredicate()); 1476 Changed = true; 1477 // Note: No SCEV invalidation needed. We've changed the predicate, but 1478 // have not changed exit counts, or the values produced by the compare. 1479 continue; 1480 } 1481 } 1482 1483 // Now that we've canonicalized the condition to match the extend, 1484 // see if we can rotate the extend out of the loop. 1485 for (auto *ExitingBB : ExitingBlocks) { 1486 auto *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator()); 1487 if (!BI) 1488 continue; 1489 assert(BI->isConditional() && "exit branch must be conditional"); 1490 1491 auto *ICmp = dyn_cast<ICmpInst>(BI->getCondition()); 1492 if (!ICmp || !ICmp->hasOneUse() || !ICmp->isUnsigned()) 1493 continue; 1494 1495 bool Swapped = false; 1496 auto *LHS = ICmp->getOperand(0); 1497 auto *RHS = ICmp->getOperand(1); 1498 if (L->isLoopInvariant(LHS) == L->isLoopInvariant(RHS)) 1499 // Nothing to rotate 1500 continue; 1501 if (L->isLoopInvariant(LHS)) { 1502 // Same logic applies for the inverse case until we actually pick 1503 // which operand of the compare to update. 1504 Swapped = true; 1505 std::swap(LHS, RHS); 1506 } 1507 assert(!L->isLoopInvariant(LHS) && L->isLoopInvariant(RHS)); 1508 1509 // Match (icmp unsigned-cond zext, RHS) 1510 // TODO: Extend to handle corresponding sext/signed-cmp case 1511 // TODO: Extend to other invertible functions 1512 Value *LHSOp = nullptr; 1513 if (!match(LHS, m_ZExt(m_Value(LHSOp)))) 1514 continue; 1515 1516 // In general, we only rotate if we can do so without increasing the number 1517 // of instructions. The exception is when we have an zext(add-rec). The 1518 // reason for allowing this exception is that we know we need to get rid 1519 // of the zext for SCEV to be able to compute a trip count for said loops; 1520 // we consider the new trip count valuable enough to increase instruction 1521 // count by one. 1522 if (!LHS->hasOneUse() && !isa<SCEVAddRecExpr>(SE->getSCEV(LHSOp))) 1523 continue; 1524 1525 // Given a icmp unsigned-cond zext(Op) where zext(trunc(RHS)) == RHS 1526 // replace with an icmp of the form icmp unsigned-cond Op, trunc(RHS) 1527 // when zext is loop varying and RHS is loop invariant. This converts 1528 // loop varying work to loop-invariant work. 1529 auto doRotateTransform = [&]() { 1530 assert(ICmp->isUnsigned() && "must have proven unsigned already"); 1531 auto *NewRHS = CastInst::Create( 1532 Instruction::Trunc, RHS, LHSOp->getType(), "", 1533 L->getLoopPreheader()->getTerminator()->getIterator()); 1534 ICmp->setOperand(Swapped ? 1 : 0, LHSOp); 1535 ICmp->setOperand(Swapped ? 0 : 1, NewRHS); 1536 if (LHS->use_empty()) 1537 DeadInsts.push_back(LHS); 1538 }; 1539 1540 1541 const DataLayout &DL = ExitingBB->getDataLayout(); 1542 const unsigned InnerBitWidth = DL.getTypeSizeInBits(LHSOp->getType()); 1543 const unsigned OuterBitWidth = DL.getTypeSizeInBits(RHS->getType()); 1544 auto FullCR = ConstantRange::getFull(InnerBitWidth); 1545 FullCR = FullCR.zeroExtend(OuterBitWidth); 1546 auto RHSCR = SE->getUnsignedRange(SE->applyLoopGuards(SE->getSCEV(RHS), L)); 1547 if (FullCR.contains(RHSCR)) { 1548 doRotateTransform(); 1549 Changed = true; 1550 // Note, we are leaving SCEV in an unfortunately imprecise case here 1551 // as rotation tends to reveal information about trip counts not 1552 // previously visible. 1553 continue; 1554 } 1555 } 1556 1557 return Changed; 1558 } 1559 1560 bool IndVarSimplify::optimizeLoopExits(Loop *L, SCEVExpander &Rewriter) { 1561 SmallVector<BasicBlock*, 16> ExitingBlocks; 1562 L->getExitingBlocks(ExitingBlocks); 1563 1564 // Remove all exits which aren't both rewriteable and execute on every 1565 // iteration. 1566 llvm::erase_if(ExitingBlocks, [&](BasicBlock *ExitingBB) { 1567 // If our exitting block exits multiple loops, we can only rewrite the 1568 // innermost one. Otherwise, we're changing how many times the innermost 1569 // loop runs before it exits. 1570 if (LI->getLoopFor(ExitingBB) != L) 1571 return true; 1572 1573 // Can't rewrite non-branch yet. 1574 BranchInst *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator()); 1575 if (!BI) 1576 return true; 1577 1578 // Likewise, the loop latch must be dominated by the exiting BB. 1579 if (!DT->dominates(ExitingBB, L->getLoopLatch())) 1580 return true; 1581 1582 if (auto *CI = dyn_cast<ConstantInt>(BI->getCondition())) { 1583 // If already constant, nothing to do. However, if this is an 1584 // unconditional exit, we can still replace header phis with their 1585 // preheader value. 1586 if (!L->contains(BI->getSuccessor(CI->isNullValue()))) 1587 replaceLoopPHINodesWithPreheaderValues(LI, L, DeadInsts, *SE); 1588 return true; 1589 } 1590 1591 return false; 1592 }); 1593 1594 if (ExitingBlocks.empty()) 1595 return false; 1596 1597 // Get a symbolic upper bound on the loop backedge taken count. 1598 const SCEV *MaxBECount = SE->getSymbolicMaxBackedgeTakenCount(L); 1599 if (isa<SCEVCouldNotCompute>(MaxBECount)) 1600 return false; 1601 1602 // Visit our exit blocks in order of dominance. We know from the fact that 1603 // all exits must dominate the latch, so there is a total dominance order 1604 // between them. 1605 llvm::sort(ExitingBlocks, [&](BasicBlock *A, BasicBlock *B) { 1606 // std::sort sorts in ascending order, so we want the inverse of 1607 // the normal dominance relation. 1608 if (A == B) return false; 1609 if (DT->properlyDominates(A, B)) 1610 return true; 1611 else { 1612 assert(DT->properlyDominates(B, A) && 1613 "expected total dominance order!"); 1614 return false; 1615 } 1616 }); 1617 #ifdef ASSERT 1618 for (unsigned i = 1; i < ExitingBlocks.size(); i++) { 1619 assert(DT->dominates(ExitingBlocks[i-1], ExitingBlocks[i])); 1620 } 1621 #endif 1622 1623 bool Changed = false; 1624 bool SkipLastIter = false; 1625 const SCEV *CurrMaxExit = SE->getCouldNotCompute(); 1626 auto UpdateSkipLastIter = [&](const SCEV *MaxExitCount) { 1627 if (SkipLastIter || isa<SCEVCouldNotCompute>(MaxExitCount)) 1628 return; 1629 if (isa<SCEVCouldNotCompute>(CurrMaxExit)) 1630 CurrMaxExit = MaxExitCount; 1631 else 1632 CurrMaxExit = SE->getUMinFromMismatchedTypes(CurrMaxExit, MaxExitCount); 1633 // If the loop has more than 1 iteration, all further checks will be 1634 // executed 1 iteration less. 1635 if (CurrMaxExit == MaxBECount) 1636 SkipLastIter = true; 1637 }; 1638 SmallSet<const SCEV *, 8> DominatingExactExitCounts; 1639 for (BasicBlock *ExitingBB : ExitingBlocks) { 1640 const SCEV *ExactExitCount = SE->getExitCount(L, ExitingBB); 1641 const SCEV *MaxExitCount = SE->getExitCount( 1642 L, ExitingBB, ScalarEvolution::ExitCountKind::SymbolicMaximum); 1643 if (isa<SCEVCouldNotCompute>(ExactExitCount)) { 1644 // Okay, we do not know the exit count here. Can we at least prove that it 1645 // will remain the same within iteration space? 1646 auto *BI = cast<BranchInst>(ExitingBB->getTerminator()); 1647 auto OptimizeCond = [&](bool SkipLastIter) { 1648 return optimizeLoopExitWithUnknownExitCount(L, BI, ExitingBB, 1649 MaxBECount, SkipLastIter, 1650 SE, Rewriter, DeadInsts); 1651 }; 1652 1653 // TODO: We might have proved that we can skip the last iteration for 1654 // this check. In this case, we only want to check the condition on the 1655 // pre-last iteration (MaxBECount - 1). However, there is a nasty 1656 // corner case: 1657 // 1658 // for (i = len; i != 0; i--) { ... check (i ult X) ... } 1659 // 1660 // If we could not prove that len != 0, then we also could not prove that 1661 // (len - 1) is not a UINT_MAX. If we simply query (len - 1), then 1662 // OptimizeCond will likely not prove anything for it, even if it could 1663 // prove the same fact for len. 1664 // 1665 // As a temporary solution, we query both last and pre-last iterations in 1666 // hope that we will be able to prove triviality for at least one of 1667 // them. We can stop querying MaxBECount for this case once SCEV 1668 // understands that (MaxBECount - 1) will not overflow here. 1669 if (OptimizeCond(false)) 1670 Changed = true; 1671 else if (SkipLastIter && OptimizeCond(true)) 1672 Changed = true; 1673 UpdateSkipLastIter(MaxExitCount); 1674 continue; 1675 } 1676 1677 UpdateSkipLastIter(ExactExitCount); 1678 1679 // If we know we'd exit on the first iteration, rewrite the exit to 1680 // reflect this. This does not imply the loop must exit through this 1681 // exit; there may be an earlier one taken on the first iteration. 1682 // We know that the backedge can't be taken, so we replace all 1683 // the header PHIs with values coming from the preheader. 1684 if (ExactExitCount->isZero()) { 1685 foldExit(L, ExitingBB, true, DeadInsts); 1686 replaceLoopPHINodesWithPreheaderValues(LI, L, DeadInsts, *SE); 1687 Changed = true; 1688 continue; 1689 } 1690 1691 assert(ExactExitCount->getType()->isIntegerTy() && 1692 MaxBECount->getType()->isIntegerTy() && 1693 "Exit counts must be integers"); 1694 1695 Type *WiderType = 1696 SE->getWiderType(MaxBECount->getType(), ExactExitCount->getType()); 1697 ExactExitCount = SE->getNoopOrZeroExtend(ExactExitCount, WiderType); 1698 MaxBECount = SE->getNoopOrZeroExtend(MaxBECount, WiderType); 1699 assert(MaxBECount->getType() == ExactExitCount->getType()); 1700 1701 // Can we prove that some other exit must be taken strictly before this 1702 // one? 1703 if (SE->isLoopEntryGuardedByCond(L, CmpInst::ICMP_ULT, MaxBECount, 1704 ExactExitCount)) { 1705 foldExit(L, ExitingBB, false, DeadInsts); 1706 Changed = true; 1707 continue; 1708 } 1709 1710 // As we run, keep track of which exit counts we've encountered. If we 1711 // find a duplicate, we've found an exit which would have exited on the 1712 // exiting iteration, but (from the visit order) strictly follows another 1713 // which does the same and is thus dead. 1714 if (!DominatingExactExitCounts.insert(ExactExitCount).second) { 1715 foldExit(L, ExitingBB, false, DeadInsts); 1716 Changed = true; 1717 continue; 1718 } 1719 1720 // TODO: There might be another oppurtunity to leverage SCEV's reasoning 1721 // here. If we kept track of the min of dominanting exits so far, we could 1722 // discharge exits with EC >= MDEC. This is less powerful than the existing 1723 // transform (since later exits aren't considered), but potentially more 1724 // powerful for any case where SCEV can prove a >=u b, but neither a == b 1725 // or a >u b. Such a case is not currently known. 1726 } 1727 return Changed; 1728 } 1729 1730 bool IndVarSimplify::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) { 1731 SmallVector<BasicBlock*, 16> ExitingBlocks; 1732 L->getExitingBlocks(ExitingBlocks); 1733 1734 // Finally, see if we can rewrite our exit conditions into a loop invariant 1735 // form. If we have a read-only loop, and we can tell that we must exit down 1736 // a path which does not need any of the values computed within the loop, we 1737 // can rewrite the loop to exit on the first iteration. Note that this 1738 // doesn't either a) tell us the loop exits on the first iteration (unless 1739 // *all* exits are predicateable) or b) tell us *which* exit might be taken. 1740 // This transformation looks a lot like a restricted form of dead loop 1741 // elimination, but restricted to read-only loops and without neccesssarily 1742 // needing to kill the loop entirely. 1743 if (!LoopPredication) 1744 return false; 1745 1746 // Note: ExactBTC is the exact backedge taken count *iff* the loop exits 1747 // through *explicit* control flow. We have to eliminate the possibility of 1748 // implicit exits (see below) before we know it's truly exact. 1749 const SCEV *ExactBTC = SE->getBackedgeTakenCount(L); 1750 if (isa<SCEVCouldNotCompute>(ExactBTC) || !Rewriter.isSafeToExpand(ExactBTC)) 1751 return false; 1752 1753 assert(SE->isLoopInvariant(ExactBTC, L) && "BTC must be loop invariant"); 1754 assert(ExactBTC->getType()->isIntegerTy() && "BTC must be integer"); 1755 1756 auto BadExit = [&](BasicBlock *ExitingBB) { 1757 // If our exiting block exits multiple loops, we can only rewrite the 1758 // innermost one. Otherwise, we're changing how many times the innermost 1759 // loop runs before it exits. 1760 if (LI->getLoopFor(ExitingBB) != L) 1761 return true; 1762 1763 // Can't rewrite non-branch yet. 1764 BranchInst *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator()); 1765 if (!BI) 1766 return true; 1767 1768 // If already constant, nothing to do. 1769 if (isa<Constant>(BI->getCondition())) 1770 return true; 1771 1772 // If the exit block has phis, we need to be able to compute the values 1773 // within the loop which contains them. This assumes trivially lcssa phis 1774 // have already been removed; TODO: generalize 1775 BasicBlock *ExitBlock = 1776 BI->getSuccessor(L->contains(BI->getSuccessor(0)) ? 1 : 0); 1777 if (!ExitBlock->phis().empty()) 1778 return true; 1779 1780 const SCEV *ExitCount = SE->getExitCount(L, ExitingBB); 1781 if (isa<SCEVCouldNotCompute>(ExitCount) || 1782 !Rewriter.isSafeToExpand(ExitCount)) 1783 return true; 1784 1785 assert(SE->isLoopInvariant(ExitCount, L) && 1786 "Exit count must be loop invariant"); 1787 assert(ExitCount->getType()->isIntegerTy() && "Exit count must be integer"); 1788 return false; 1789 }; 1790 1791 // If we have any exits which can't be predicated themselves, than we can't 1792 // predicate any exit which isn't guaranteed to execute before it. Consider 1793 // two exits (a) and (b) which would both exit on the same iteration. If we 1794 // can predicate (b), but not (a), and (a) preceeds (b) along some path, then 1795 // we could convert a loop from exiting through (a) to one exiting through 1796 // (b). Note that this problem exists only for exits with the same exit 1797 // count, and we could be more aggressive when exit counts are known inequal. 1798 llvm::sort(ExitingBlocks, 1799 [&](BasicBlock *A, BasicBlock *B) { 1800 // std::sort sorts in ascending order, so we want the inverse of 1801 // the normal dominance relation, plus a tie breaker for blocks 1802 // unordered by dominance. 1803 if (DT->properlyDominates(A, B)) return true; 1804 if (DT->properlyDominates(B, A)) return false; 1805 return A->getName() < B->getName(); 1806 }); 1807 // Check to see if our exit blocks are a total order (i.e. a linear chain of 1808 // exits before the backedge). If they aren't, reasoning about reachability 1809 // is complicated and we choose not to for now. 1810 for (unsigned i = 1; i < ExitingBlocks.size(); i++) 1811 if (!DT->dominates(ExitingBlocks[i-1], ExitingBlocks[i])) 1812 return false; 1813 1814 // Given our sorted total order, we know that exit[j] must be evaluated 1815 // after all exit[i] such j > i. 1816 for (unsigned i = 0, e = ExitingBlocks.size(); i < e; i++) 1817 if (BadExit(ExitingBlocks[i])) { 1818 ExitingBlocks.resize(i); 1819 break; 1820 } 1821 1822 if (ExitingBlocks.empty()) 1823 return false; 1824 1825 // We rely on not being able to reach an exiting block on a later iteration 1826 // then it's statically compute exit count. The implementaton of 1827 // getExitCount currently has this invariant, but assert it here so that 1828 // breakage is obvious if this ever changes.. 1829 assert(llvm::all_of(ExitingBlocks, [&](BasicBlock *ExitingBB) { 1830 return DT->dominates(ExitingBB, L->getLoopLatch()); 1831 })); 1832 1833 // At this point, ExitingBlocks consists of only those blocks which are 1834 // predicatable. Given that, we know we have at least one exit we can 1835 // predicate if the loop is doesn't have side effects and doesn't have any 1836 // implicit exits (because then our exact BTC isn't actually exact). 1837 // @Reviewers - As structured, this is O(I^2) for loop nests. Any 1838 // suggestions on how to improve this? I can obviously bail out for outer 1839 // loops, but that seems less than ideal. MemorySSA can find memory writes, 1840 // is that enough for *all* side effects? 1841 for (BasicBlock *BB : L->blocks()) 1842 for (auto &I : *BB) 1843 // TODO:isGuaranteedToTransfer 1844 if (I.mayHaveSideEffects()) 1845 return false; 1846 1847 bool Changed = false; 1848 // Finally, do the actual predication for all predicatable blocks. A couple 1849 // of notes here: 1850 // 1) We don't bother to constant fold dominated exits with identical exit 1851 // counts; that's simply a form of CSE/equality propagation and we leave 1852 // it for dedicated passes. 1853 // 2) We insert the comparison at the branch. Hoisting introduces additional 1854 // legality constraints and we leave that to dedicated logic. We want to 1855 // predicate even if we can't insert a loop invariant expression as 1856 // peeling or unrolling will likely reduce the cost of the otherwise loop 1857 // varying check. 1858 Rewriter.setInsertPoint(L->getLoopPreheader()->getTerminator()); 1859 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 1860 Value *ExactBTCV = nullptr; // Lazily generated if needed. 1861 for (BasicBlock *ExitingBB : ExitingBlocks) { 1862 const SCEV *ExitCount = SE->getExitCount(L, ExitingBB); 1863 1864 auto *BI = cast<BranchInst>(ExitingBB->getTerminator()); 1865 Value *NewCond; 1866 if (ExitCount == ExactBTC) { 1867 NewCond = L->contains(BI->getSuccessor(0)) ? 1868 B.getFalse() : B.getTrue(); 1869 } else { 1870 Value *ECV = Rewriter.expandCodeFor(ExitCount); 1871 if (!ExactBTCV) 1872 ExactBTCV = Rewriter.expandCodeFor(ExactBTC); 1873 Value *RHS = ExactBTCV; 1874 if (ECV->getType() != RHS->getType()) { 1875 Type *WiderTy = SE->getWiderType(ECV->getType(), RHS->getType()); 1876 ECV = B.CreateZExt(ECV, WiderTy); 1877 RHS = B.CreateZExt(RHS, WiderTy); 1878 } 1879 auto Pred = L->contains(BI->getSuccessor(0)) ? 1880 ICmpInst::ICMP_NE : ICmpInst::ICMP_EQ; 1881 NewCond = B.CreateICmp(Pred, ECV, RHS); 1882 } 1883 Value *OldCond = BI->getCondition(); 1884 BI->setCondition(NewCond); 1885 if (OldCond->use_empty()) 1886 DeadInsts.emplace_back(OldCond); 1887 Changed = true; 1888 RunUnswitching = true; 1889 } 1890 1891 return Changed; 1892 } 1893 1894 //===----------------------------------------------------------------------===// 1895 // IndVarSimplify driver. Manage several subpasses of IV simplification. 1896 //===----------------------------------------------------------------------===// 1897 1898 bool IndVarSimplify::run(Loop *L) { 1899 // We need (and expect!) the incoming loop to be in LCSSA. 1900 assert(L->isRecursivelyLCSSAForm(*DT, *LI) && 1901 "LCSSA required to run indvars!"); 1902 1903 // If LoopSimplify form is not available, stay out of trouble. Some notes: 1904 // - LSR currently only supports LoopSimplify-form loops. Indvars' 1905 // canonicalization can be a pessimization without LSR to "clean up" 1906 // afterwards. 1907 // - We depend on having a preheader; in particular, 1908 // Loop::getCanonicalInductionVariable only supports loops with preheaders, 1909 // and we're in trouble if we can't find the induction variable even when 1910 // we've manually inserted one. 1911 // - LFTR relies on having a single backedge. 1912 if (!L->isLoopSimplifyForm()) 1913 return false; 1914 1915 bool Changed = false; 1916 // If there are any floating-point recurrences, attempt to 1917 // transform them to use integer recurrences. 1918 Changed |= rewriteNonIntegerIVs(L); 1919 1920 // Create a rewriter object which we'll use to transform the code with. 1921 SCEVExpander Rewriter(*SE, DL, "indvars"); 1922 #ifndef NDEBUG 1923 Rewriter.setDebugType(DEBUG_TYPE); 1924 #endif 1925 1926 // Eliminate redundant IV users. 1927 // 1928 // Simplification works best when run before other consumers of SCEV. We 1929 // attempt to avoid evaluating SCEVs for sign/zero extend operations until 1930 // other expressions involving loop IVs have been evaluated. This helps SCEV 1931 // set no-wrap flags before normalizing sign/zero extension. 1932 Rewriter.disableCanonicalMode(); 1933 Changed |= simplifyAndExtend(L, Rewriter, LI); 1934 1935 // Check to see if we can compute the final value of any expressions 1936 // that are recurrent in the loop, and substitute the exit values from the 1937 // loop into any instructions outside of the loop that use the final values 1938 // of the current expressions. 1939 if (ReplaceExitValue != NeverRepl) { 1940 if (int Rewrites = rewriteLoopExitValues(L, LI, TLI, SE, TTI, Rewriter, DT, 1941 ReplaceExitValue, DeadInsts)) { 1942 NumReplaced += Rewrites; 1943 Changed = true; 1944 } 1945 } 1946 1947 // Eliminate redundant IV cycles. 1948 NumElimIV += Rewriter.replaceCongruentIVs(L, DT, DeadInsts, TTI); 1949 1950 // Try to convert exit conditions to unsigned and rotate computation 1951 // out of the loop. Note: Handles invalidation internally if needed. 1952 Changed |= canonicalizeExitCondition(L); 1953 1954 // Try to eliminate loop exits based on analyzeable exit counts 1955 if (optimizeLoopExits(L, Rewriter)) { 1956 Changed = true; 1957 // Given we've changed exit counts, notify SCEV 1958 // Some nested loops may share same folded exit basic block, 1959 // thus we need to notify top most loop. 1960 SE->forgetTopmostLoop(L); 1961 } 1962 1963 // Try to form loop invariant tests for loop exits by changing how many 1964 // iterations of the loop run when that is unobservable. 1965 if (predicateLoopExits(L, Rewriter)) { 1966 Changed = true; 1967 // Given we've changed exit counts, notify SCEV 1968 SE->forgetLoop(L); 1969 } 1970 1971 // If we have a trip count expression, rewrite the loop's exit condition 1972 // using it. 1973 if (!DisableLFTR) { 1974 BasicBlock *PreHeader = L->getLoopPreheader(); 1975 1976 SmallVector<BasicBlock*, 16> ExitingBlocks; 1977 L->getExitingBlocks(ExitingBlocks); 1978 for (BasicBlock *ExitingBB : ExitingBlocks) { 1979 // Can't rewrite non-branch yet. 1980 if (!isa<BranchInst>(ExitingBB->getTerminator())) 1981 continue; 1982 1983 // If our exitting block exits multiple loops, we can only rewrite the 1984 // innermost one. Otherwise, we're changing how many times the innermost 1985 // loop runs before it exits. 1986 if (LI->getLoopFor(ExitingBB) != L) 1987 continue; 1988 1989 if (!needsLFTR(L, ExitingBB)) 1990 continue; 1991 1992 const SCEV *ExitCount = SE->getExitCount(L, ExitingBB); 1993 if (isa<SCEVCouldNotCompute>(ExitCount)) 1994 continue; 1995 1996 // This was handled above, but as we form SCEVs, we can sometimes refine 1997 // existing ones; this allows exit counts to be folded to zero which 1998 // weren't when optimizeLoopExits saw them. Arguably, we should iterate 1999 // until stable to handle cases like this better. 2000 if (ExitCount->isZero()) 2001 continue; 2002 2003 PHINode *IndVar = FindLoopCounter(L, ExitingBB, ExitCount, SE, DT); 2004 if (!IndVar) 2005 continue; 2006 2007 // Avoid high cost expansions. Note: This heuristic is questionable in 2008 // that our definition of "high cost" is not exactly principled. 2009 if (Rewriter.isHighCostExpansion(ExitCount, L, SCEVCheapExpansionBudget, 2010 TTI, PreHeader->getTerminator())) 2011 continue; 2012 2013 if (!Rewriter.isSafeToExpand(ExitCount)) 2014 continue; 2015 2016 Changed |= linearFunctionTestReplace(L, ExitingBB, 2017 ExitCount, IndVar, 2018 Rewriter); 2019 } 2020 } 2021 // Clear the rewriter cache, because values that are in the rewriter's cache 2022 // can be deleted in the loop below, causing the AssertingVH in the cache to 2023 // trigger. 2024 Rewriter.clear(); 2025 2026 // Now that we're done iterating through lists, clean up any instructions 2027 // which are now dead. 2028 while (!DeadInsts.empty()) { 2029 Value *V = DeadInsts.pop_back_val(); 2030 2031 if (PHINode *PHI = dyn_cast_or_null<PHINode>(V)) 2032 Changed |= RecursivelyDeleteDeadPHINode(PHI, TLI, MSSAU.get()); 2033 else if (Instruction *Inst = dyn_cast_or_null<Instruction>(V)) 2034 Changed |= 2035 RecursivelyDeleteTriviallyDeadInstructions(Inst, TLI, MSSAU.get()); 2036 } 2037 2038 // The Rewriter may not be used from this point on. 2039 2040 // Loop-invariant instructions in the preheader that aren't used in the 2041 // loop may be sunk below the loop to reduce register pressure. 2042 Changed |= sinkUnusedInvariants(L); 2043 2044 // rewriteFirstIterationLoopExitValues does not rely on the computation of 2045 // trip count and therefore can further simplify exit values in addition to 2046 // rewriteLoopExitValues. 2047 Changed |= rewriteFirstIterationLoopExitValues(L); 2048 2049 // Clean up dead instructions. 2050 Changed |= DeleteDeadPHIs(L->getHeader(), TLI, MSSAU.get()); 2051 2052 // Check a post-condition. 2053 assert(L->isRecursivelyLCSSAForm(*DT, *LI) && 2054 "Indvars did not preserve LCSSA!"); 2055 if (VerifyMemorySSA && MSSAU) 2056 MSSAU->getMemorySSA()->verifyMemorySSA(); 2057 2058 return Changed; 2059 } 2060 2061 PreservedAnalyses IndVarSimplifyPass::run(Loop &L, LoopAnalysisManager &AM, 2062 LoopStandardAnalysisResults &AR, 2063 LPMUpdater &) { 2064 Function *F = L.getHeader()->getParent(); 2065 const DataLayout &DL = F->getDataLayout(); 2066 2067 IndVarSimplify IVS(&AR.LI, &AR.SE, &AR.DT, DL, &AR.TLI, &AR.TTI, AR.MSSA, 2068 WidenIndVars && AllowIVWidening); 2069 if (!IVS.run(&L)) 2070 return PreservedAnalyses::all(); 2071 2072 auto PA = getLoopPassPreservedAnalyses(); 2073 PA.preserveSet<CFGAnalyses>(); 2074 if (IVS.runUnswitching()) { 2075 AM.getResult<ShouldRunExtraSimpleLoopUnswitch>(L, AR); 2076 PA.preserve<ShouldRunExtraSimpleLoopUnswitch>(); 2077 } 2078 2079 if (AR.MSSA) 2080 PA.preserve<MemorySSAAnalysis>(); 2081 return PA; 2082 } 2083