//===- IndVarSimplify.cpp - Induction Variable Elimination ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This transformation analyzes and transforms the induction variables (and
// computations derived from them) into simpler forms suitable for subsequent
// analysis and transformation.
//
// If the trip count of a loop is computable, this pass also makes the following
// changes:
//   1. The exit condition for the loop is canonicalized to compare the
//      induction value against the exit value.  This turns loops like:
//        'for (i = 7; i*i < 1000; ++i)' into 'for (i = 0; i != 25; ++i)'
//   2. Any use outside of the loop of an expression derived from the indvar
//      is changed to compute the derived value outside of the loop, eliminating
//      the dependence on the exit value of the induction variable.  If the only
//      purpose of the loop is to compute the exit value of some derived
//      expression, this transformation will make the loop dead.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Scalar/IndVarSimplify.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SimplifyIndVar.h"
#include <cassert>
#include <cstdint>
#include <utility>

using namespace llvm;
using namespace PatternMatch;

#define DEBUG_TYPE "indvars"

// Pass statistics. Each counter's meaning is given by its description string.
STATISTIC(NumWidened     , "Number of indvars widened");
STATISTIC(NumReplaced    , "Number of exit values replaced");
STATISTIC(NumLFTR        , "Number of loop exit tests replaced");
STATISTIC(NumElimExt     , "Number of IV sign/zero extends eliminated");
STATISTIC(NumElimIV      , "Number of congruent IVs eliminated");

// -replexitval: strategy for replacing loop exit values. Defaults to
// OnlyCheapRepl ("cheap").
static cl::opt<ReplaceExitVal> ReplaceExitValue(
    "replexitval", cl::Hidden, cl::init(OnlyCheapRepl),
    cl::desc("Choose the strategy to replace exit value in IndVarSimplify"),
    cl::values(
        clEnumValN(NeverRepl, "never", "never replace exit value"),
        clEnumValN(OnlyCheapRepl, "cheap",
                   "only replace exit value when the cost is cheap"),
        clEnumValN(
            UnusedIndVarInLoop, "unusedindvarinloop",
            "only replace exit value when it is an unused "
            "induction variable in the loop and has cheap replacement cost"),
        clEnumValN(NoHardUse, "noharduse",
                   "only replace exit values when loop def likely dead"),
        clEnumValN(AlwaysRepl, "always",
                   "always replace exit value whenever possible")));

// -indvars-post-increment-ranges (default true). Forwarded to createWideIV by
// simplifyAndExtend.
static cl::opt<bool> UsePostIncrementRanges(
  "indvars-post-increment-ranges", cl::Hidden,
  cl::desc("Use post increment control-dependent ranges in IndVarSimplify"),
  cl::init(true));

// -disable-lftr (default false): turns off linear function test replacement.
static cl::opt<bool>
DisableLFTR("disable-lftr", cl::Hidden, cl::init(false),
            cl::desc("Disable Linear Function Test Replace optimization"));

// -indvars-predicate-loops (default true).
static cl::opt<bool>
LoopPredication("indvars-predicate-loops", cl::Hidden, cl::init(true),
                cl::desc("Predicate conditions in read only loops"));

// -indvars-widen-indvars (default true).
static cl::opt<bool>
AllowIVWidening("indvars-widen-indvars", cl::Hidden, cl::init(true),
                cl::desc("Allow widening of indvars to eliminate s/zext"));

namespace {

/// Implementation object for the induction variable simplification pass. It
/// bundles the analyses the individual transformation steps need and exposes a
/// single entry point, run().
class IndVarSimplify {
  // Analyses supplied by the caller through the constructor; this class only
  // stores the pointers/reference it is given.
  LoopInfo *LI;
  ScalarEvolution *SE;
  DominatorTree *DT;
  const DataLayout &DL;
  TargetLibraryInfo *TLI;
  const TargetTransformInfo *TTI;
  // Created in the constructor only when a MemorySSA is supplied; left null
  // otherwise. Handed (via get()) to the instruction-deletion helpers.
  std::unique_ptr<MemorySSAUpdater> MSSAU;

  // Work list of instructions filled in by simplifyUsersOfIV / createWideIV
  // (see simplifyAndExtend). Held through value handles rather than raw
  // pointers.
  SmallVector<WeakTrackingVH, 16> DeadInsts;
  // When false, simplifyAndExtend still simplifies IV users but never widens.
  bool WidenIndVars;

  /// Try to replace one floating-point header phi by an i32 induction
  /// variable. Returns true if the IR was changed.
  bool handleFloatingPointIV(Loop *L, PHINode *PH);
  /// Apply handleFloatingPointIV to every phi in the loop header.
  bool rewriteNonIntegerIVs(Loop *L);

  /// Simplify the users of each header phi and, if allowed, widen IVs to
  /// eliminate sign/zero extensions.
  bool simplifyAndExtend(Loop *L, SCEVExpander &Rewriter, LoopInfo *LI);
  /// Try to improve our exit conditions by converting condition from signed
  /// to unsigned or rotating computation out of the loop.
  /// (See inline comment about why this is duplicated from simplifyAndExtend)
  bool canonicalizeExitCondition(Loop *L);
  /// Try to eliminate loop exits based on analyzable exit counts
  bool optimizeLoopExits(Loop *L, SCEVExpander &Rewriter);
  /// Try to form loop invariant tests for loop exits by changing how many
  /// iterations of the loop run when that is unobservable.
  bool predicateLoopExits(Loop *L, SCEVExpander &Rewriter);

  /// Rewrite exit-block phi operands that are header phis when the exit can
  /// only be reached on the first iteration.
  bool rewriteFirstIterationLoopExitValues(Loop *L);

  bool linearFunctionTestReplace(Loop *L, BasicBlock *ExitingBB,
                                 const SCEV *ExitCount,
                                 PHINode *IndVar, SCEVExpander &Rewriter);

  bool sinkUnusedInvariants(Loop *L);

public:
  /// Store the supplied analyses. \p MSSA may be null, in which case no
  /// MemorySSAUpdater is created.
  IndVarSimplify(LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
                 const DataLayout &DL, TargetLibraryInfo *TLI,
                 TargetTransformInfo *TTI, MemorySSA *MSSA, bool WidenIndVars)
      : LI(LI), SE(SE), DT(DT), DL(DL), TLI(TLI), TTI(TTI),
        WidenIndVars(WidenIndVars) {
    if (MSSA)
      MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
  }

  bool run(Loop *L);
};

} // end anonymous namespace

//===----------------------------------------------------------------------===//
// rewriteNonIntegerIVs and helpers. Prefer integer IVs.
//===----------------------------------------------------------------------===//

/// Convert APF to an integer, if possible.
///
/// \param APF     Floating-point value to convert.
/// \param IntVal  [out] Receives the converted value; written only on success.
/// \returns true only if convertToInteger (64 bits, rounding toward zero)
/// reports opOK and flags the conversion as exact.
static bool ConvertToSInt(const APFloat &APF, int64_t &IntVal) {
  bool isExact = false;
  // See if we can convert this to an int64_t
  uint64_t UIntVal;
  if (APF.convertToInteger(MutableArrayRef(UIntVal), 64, true,
                           APFloat::rmTowardZero, &isExact) != APFloat::opOK ||
      !isExact)
    return false;
  IntVal = UIntVal;
  return true;
}

/// If the loop has floating induction variable then insert corresponding
/// integer induction variable if possible.
/// For example,
/// for(double i = 0; i < 10000; ++i)
///   bar(i)
/// is converted into
/// for(int i = 0; i < 10000; ++i)
///   bar((double)i);
///
/// The rewrite is attempted only when the start value, the increment and the
/// exit bound are floating-point constants that convert exactly to integers
/// fitting in a signed i32, the increment is an fadd of the phi with that
/// constant and has exactly two users (one of them an fcmp that solely feeds a
/// loop-exiting branch), and the checks below show the i32 counter cannot wrap
/// before the exit test fires.
///
/// \param L   Loop used to classify the phi's incoming edges and the branch
///            successors.
/// \param PN  Candidate phi. NOTE(review): only incoming edges 0 and 1 are
///            examined, so the phi is presumably expected to have exactly two
///            incoming blocks - confirm against the caller's loop form.
/// \returns true if the IR was rewritten, false if nothing was touched.
bool IndVarSimplify::handleFloatingPointIV(Loop *L, PHINode *PN) {
  // contains() yields 1 when incoming block 0 lies inside the loop (so block 0
  // is the backedge and block 1 the entry edge), and 0 otherwise.
  unsigned IncomingEdge = L->contains(PN->getIncomingBlock(0));
  unsigned BackEdge     = IncomingEdge^1;

  // Check incoming value.
  auto *InitValueVal = dyn_cast<ConstantFP>(PN->getIncomingValue(IncomingEdge));

  int64_t InitValue;
  if (!InitValueVal || !ConvertToSInt(InitValueVal->getValueAPF(), InitValue))
    return false;

  // Check IV increment. Reject this PN if increment operation is not
  // an add or increment value can not be represented by an integer.
  auto *Incr = dyn_cast<BinaryOperator>(PN->getIncomingValue(BackEdge));
  if (Incr == nullptr || Incr->getOpcode() != Instruction::FAdd) return false;

  // If this is not an add of the PHI with a constantfp, or if the constant fp
  // is not an integer, bail out.
  ConstantFP *IncValueVal = dyn_cast<ConstantFP>(Incr->getOperand(1));
  int64_t IncValue;
  if (IncValueVal == nullptr || Incr->getOperand(0) != PN ||
      !ConvertToSInt(IncValueVal->getValueAPF(), IncValue))
    return false;

  // Check Incr uses. One user is PN and the other user is an exit condition
  // used by the conditional terminator.
  // The two early returns below enforce "exactly two users".
  Value::user_iterator IncrUse = Incr->user_begin();
  Instruction *U1 = cast<Instruction>(*IncrUse++);
  if (IncrUse == Incr->user_end()) return false;
  Instruction *U2 = cast<Instruction>(*IncrUse++);
  if (IncrUse != Incr->user_end()) return false;

  // Find exit condition, which is an fcmp.  If it doesn't exist, or if it isn't
  // only used by a branch, we can't transform it.
  FCmpInst *Compare = dyn_cast<FCmpInst>(U1);
  if (!Compare)
    Compare = dyn_cast<FCmpInst>(U2);
  if (!Compare || !Compare->hasOneUse() ||
      !isa<BranchInst>(Compare->user_back()))
    return false;

  BranchInst *TheBr = cast<BranchInst>(Compare->user_back());

  // We need to verify that the branch actually controls the iteration count
  // of the loop.  If not, the new IV can overflow and no one will notice.
  // The branch block must be in the loop and one of the successors must be out
  // of the loop.
  assert(TheBr->isConditional() && "Can't use fcmp if not conditional");
  if (!L->contains(TheBr->getParent()) ||
      (L->contains(TheBr->getSuccessor(0)) &&
       L->contains(TheBr->getSuccessor(1))))
    return false;

  // If it isn't a comparison with an integer-as-fp (the exit value), we can't
  // transform it.
  ConstantFP *ExitValueVal = dyn_cast<ConstantFP>(Compare->getOperand(1));
  int64_t ExitValue;
  if (ExitValueVal == nullptr ||
      !ConvertToSInt(ExitValueVal->getValueAPF(), ExitValue))
    return false;

  // Find new predicate for integer comparison.
  // Ordered and unordered forms of each FP predicate map to the same signed
  // integer predicate.
  CmpInst::Predicate NewPred = CmpInst::BAD_ICMP_PREDICATE;
  switch (Compare->getPredicate()) {
  default: return false;  // Unknown comparison.
  case CmpInst::FCMP_OEQ:
  case CmpInst::FCMP_UEQ: NewPred = CmpInst::ICMP_EQ; break;
  case CmpInst::FCMP_ONE:
  case CmpInst::FCMP_UNE: NewPred = CmpInst::ICMP_NE; break;
  case CmpInst::FCMP_OGT:
  case CmpInst::FCMP_UGT: NewPred = CmpInst::ICMP_SGT; break;
  case CmpInst::FCMP_OGE:
  case CmpInst::FCMP_UGE: NewPred = CmpInst::ICMP_SGE; break;
  case CmpInst::FCMP_OLT:
  case CmpInst::FCMP_ULT: NewPred = CmpInst::ICMP_SLT; break;
  case CmpInst::FCMP_OLE:
  case CmpInst::FCMP_ULE: NewPred = CmpInst::ICMP_SLE; break;
  }

  // We convert the floating point induction variable to a signed i32 value if
  // we can.  This is only safe if the comparison will not overflow in a way
  // that won't be trapped by the integer equivalent operations.  Check for this
  // now.
  // TODO: We could use i64 if it is native and the range requires it.

  // The start/stride/exit values must all fit in signed i32.
  if (!isInt<32>(InitValue) || !isInt<32>(IncValue) || !isInt<32>(ExitValue))
    return false;

  // If not actually striding (add x, 0.0), avoid touching the code.
  if (IncValue == 0)
    return false;

  // Positive and negative strides have different safety conditions.
  if (IncValue > 0) {
    // If we have a positive stride, we require the init to be less than the
    // exit value.
    if (InitValue >= ExitValue)
      return false;

    // Distance from start to bound; both fit in i32, so the difference fits
    // in 32 unsigned bits.
    uint32_t Range = uint32_t(ExitValue-InitValue);
    // Check for infinite loop, either:
    // while (i <= Exit) or until (i > Exit)
    if (NewPred == CmpInst::ICMP_SLE || NewPred == CmpInst::ICMP_SGT) {
      if (++Range == 0) return false;  // Range overflows.
    }

    unsigned Leftover = Range % uint32_t(IncValue);

    // If this is an equality comparison, we require that the strided value
    // exactly land on the exit value, otherwise the IV condition will wrap
    // around and do things the fp IV wouldn't.
    if ((NewPred == CmpInst::ICMP_EQ || NewPred == CmpInst::ICMP_NE) &&
        Leftover != 0)
      return false;

    // If the stride would wrap around the i32 before exiting, we can't
    // transform the IV.
    if (Leftover != 0 && int32_t(ExitValue+IncValue) < ExitValue)
      return false;
  } else {
    // If we have a negative stride, we require the init to be greater than the
    // exit value.
    if (InitValue <= ExitValue)
      return false;

    uint32_t Range = uint32_t(InitValue-ExitValue);
    // Check for infinite loop, either:
    // while (i >= Exit) or until (i < Exit)
    if (NewPred == CmpInst::ICMP_SGE || NewPred == CmpInst::ICMP_SLT) {
      if (++Range == 0) return false;  // Range overflows.
    }

    unsigned Leftover = Range % uint32_t(-IncValue);

    // If this is an equality comparison, we require that the strided value
    // exactly land on the exit value, otherwise the IV condition will wrap
    // around and do things the fp IV wouldn't.
    if ((NewPred == CmpInst::ICMP_EQ || NewPred == CmpInst::ICMP_NE) &&
        Leftover != 0)
      return false;

    // If the stride would wrap around the i32 before exiting, we can't
    // transform the IV.
    if (Leftover != 0 && int32_t(ExitValue+IncValue) > ExitValue)
      return false;
  }

  // All legality checks passed; from here on the IR is modified.
  IntegerType *Int32Ty = Type::getInt32Ty(PN->getContext());

  // Insert new integer induction variable.
  PHINode *NewPHI = PHINode::Create(Int32Ty, 2, PN->getName()+".int", PN);
  NewPHI->addIncoming(ConstantInt::get(Int32Ty, InitValue),
                      PN->getIncomingBlock(IncomingEdge));

  Value *NewAdd =
    BinaryOperator::CreateAdd(NewPHI, ConstantInt::get(Int32Ty, IncValue),
                              Incr->getName()+".int", Incr);
  NewPHI->addIncoming(NewAdd, PN->getIncomingBlock(BackEdge));

  // The integer exit test compares the incremented value, mirroring the fcmp
  // it replaces; it is created at the branch.
  ICmpInst *NewCompare = new ICmpInst(TheBr, NewPred, NewAdd,
                                      ConstantInt::get(Int32Ty, ExitValue),
                                      Compare->getName());

  // In the following deletions, PN may become dead and may be deleted.
  // Use a WeakTrackingVH to observe whether this happens.
  WeakTrackingVH WeakPH = PN;

  // Delete the old floating point exit comparison.  The branch starts using the
  // new comparison.
  NewCompare->takeName(Compare);
  Compare->replaceAllUsesWith(NewCompare);
  RecursivelyDeleteTriviallyDeadInstructions(Compare, TLI, MSSAU.get());

  // Delete the old floating point increment.
  Incr->replaceAllUsesWith(PoisonValue::get(Incr->getType()));
  RecursivelyDeleteTriviallyDeadInstructions(Incr, TLI, MSSAU.get());

  // If the FP induction variable still has uses, this is because something else
  // in the loop uses its value.  In order to canonicalize the induction
  // variable, we chose to eliminate the IV and rewrite it in terms of an
  // int->fp cast.
  //
  // We give preference to sitofp over uitofp because it is faster on most
  // platforms.
  if (WeakPH) {
    Value *Conv = new SIToFPInst(NewPHI, PN->getType(), "indvar.conv",
                                 &*PN->getParent()->getFirstInsertionPt());
    PN->replaceAllUsesWith(Conv);
    RecursivelyDeleteTriviallyDeadInstructions(PN, TLI, MSSAU.get());
  }
  return true;
}

/// Run handleFloatingPointIV over every phi in the header of \p L.
///
/// \returns true if any phi was rewritten; in that case ScalarEvolution is
/// told to forget the loop.
bool IndVarSimplify::rewriteNonIntegerIVs(Loop *L) {
  // First step.  Check to see if there are any floating-point recurrences.
  // If there are, change them into integer recurrences, permitting analysis by
  // the SCEV routines.
  BasicBlock *Header = L->getHeader();

  // Snapshot the phis through value handles first: handleFloatingPointIV
  // inserts and deletes phis, so the header is not iterated while it is being
  // modified and deleted phis are skipped by the null check below.
  SmallVector<WeakTrackingVH, 8> PHIs;
  for (PHINode &PN : Header->phis())
    PHIs.push_back(&PN);

  bool Changed = false;
  for (WeakTrackingVH &PHI : PHIs)
    if (PHINode *PN = dyn_cast_or_null<PHINode>(&*PHI))
      Changed |= handleFloatingPointIV(L, PN);

  // If the loop previously had floating-point IV, ScalarEvolution
  // may not have been able to compute a trip count. Now that we've done some
  // re-writing, the trip count may be computable.
  if (Changed)
    SE->forgetLoop(L);
  return Changed;
}

//===---------------------------------------------------------------------===//
// rewriteFirstIterationLoopExitValues: Rewrite loop exit values if we know
// they will exit at the first iteration.
//===---------------------------------------------------------------------===//

/// Check to see if this loop has loop invariant conditions which lead to loop
/// exits. If so, we know that if the exit path is taken, it is at the first
/// loop iteration. This lets us predict exit values of PHI nodes that live in
/// loop header.
///
/// \returns true if any exit-block phi operand was replaced.
bool IndVarSimplify::rewriteFirstIterationLoopExitValues(Loop *L) {
  // Verify the input to the pass is already in LCSSA form.
  assert(L->isLCSSAForm(*DT));

  SmallVector<BasicBlock *, 8> ExitBlocks;
  L->getUniqueExitBlocks(ExitBlocks);

  bool MadeAnyChanges = false;
  for (auto *ExitBB : ExitBlocks) {
    // If there are no more PHI nodes in this exit block, then no more
    // values defined inside the loop are used on this path.
    for (PHINode &PN : ExitBB->phis()) {
      for (unsigned IncomingValIdx = 0, E = PN.getNumIncomingValues();
           IncomingValIdx != E; ++IncomingValIdx) {
        auto *IncomingBB = PN.getIncomingBlock(IncomingValIdx);

        // Can we prove that the exit must run on the first iteration if it
        // runs at all?  (i.e. early exits are fine for our purposes, but
        // traces which lead to this exit being taken on the 2nd iteration
        // aren't.)  Note that this is about whether the exit branch is
        // executed, not about whether it is taken.
        if (!L->getLoopLatch() ||
            !DT->dominates(IncomingBB, L->getLoopLatch()))
          continue;

        // Get condition that leads to the exit path.
        auto *TermInst = IncomingBB->getTerminator();

        Value *Cond = nullptr;
        if (auto *BI = dyn_cast<BranchInst>(TermInst)) {
          // Must be a conditional branch, otherwise the block
          // should not be in the loop.
          Cond = BI->getCondition();
        } else if (auto *SI = dyn_cast<SwitchInst>(TermInst))
          Cond = SI->getCondition();
        else
          continue;

        if (!L->isLoopInvariant(Cond))
          continue;

        auto *ExitVal = dyn_cast<PHINode>(PN.getIncomingValue(IncomingValIdx));

        // Only deal with PHIs in the loop header.
        if (!ExitVal || ExitVal->getParent() != L->getHeader())
          continue;

        // If ExitVal is a PHI on the loop header, then we know its
        // value along this exit because the exit can only be taken
        // on the first iteration.
        auto *LoopPreheader = L->getLoopPreheader();
        assert(LoopPreheader && "Invalid loop");
        int PreheaderIdx = ExitVal->getBasicBlockIndex(LoopPreheader);
        if (PreheaderIdx != -1) {
          assert(ExitVal->getParent() == L->getHeader() &&
                 "ExitVal must be in loop header");
          MadeAnyChanges = true;
          PN.setIncomingValue(IncomingValIdx,
                              ExitVal->getIncomingValue(PreheaderIdx));
          // The phi's operand changed, so drop anything SCEV knows about it.
          SE->forgetValue(&PN);
        }
      }
    }
  }
  return MadeAnyChanges;
}

//===----------------------------------------------------------------------===//
//  IV Widening - Extend the width of an IV to cover its widest uses.
//===----------------------------------------------------------------------===//

/// Update information about the induction variable that is extended by this
/// sign or zero extend operation. This is used to determine the final width of
/// the IV before actually widening it.
///
/// \param Cast  Cast user of the narrow IV; anything other than sext/zext is
///              ignored.
/// \param WI    [in,out] Widening record; WidestNativeType and IsSigned are
///              updated when \p Cast qualifies.
/// \param SE    Used for type-size queries.
/// \param TTI   Optional cost model; when null the cost check is skipped.
static void visitIVCast(CastInst *Cast, WideIVInfo &WI,
                        ScalarEvolution *SE,
                        const TargetTransformInfo *TTI) {
  bool IsSigned = Cast->getOpcode() == Instruction::SExt;
  if (!IsSigned && Cast->getOpcode() != Instruction::ZExt)
    return;

  Type *Ty = Cast->getType();
  uint64_t Width = SE->getTypeSizeInBits(Ty);
  // Only consider widening to an integer width the data layout calls legal.
  if (!Cast->getModule()->getDataLayout().isLegalInteger(Width))
    return;

  // Check that `Cast` actually extends the induction variable (we rely on this
  // later).  This takes care of cases where `Cast` is extending a truncation of
  // the narrow induction variable, and thus can end up being narrower than the
  // "narrow" induction variable.
  uint64_t NarrowIVWidth = SE->getTypeSizeInBits(WI.NarrowIV->getType());
  if (NarrowIVWidth >= Width)
    return;

  // Cast is either an sext or zext up to this point.
  // We should not widen an indvar if arithmetics on the wider indvar are more
  // expensive than those on the narrower indvar. We check only the cost of ADD
  // because at least an ADD is required to increment the induction variable. We
  // could compute more comprehensively the cost of all instructions on the
  // induction variable when necessary.
  if (TTI &&
      TTI->getArithmeticInstrCost(Instruction::Add, Ty) >
          TTI->getArithmeticInstrCost(Instruction::Add,
                                      Cast->getOperand(0)->getType())) {
    return;
  }

  // First qualifying cast, or a strictly wider one: it alone decides both the
  // target type and the signedness.
  if (!WI.WidestNativeType ||
      Width > SE->getTypeSizeInBits(WI.WidestNativeType)) {
    WI.WidestNativeType = SE->getEffectiveSCEVType(Ty);
    WI.IsSigned = IsSigned;
    return;
  }

  // We extend the IV to satisfy the sign of its user(s), or 'signed'
  // if there are multiple users with both sign- and zero extensions,
  // in order not to introduce nondeterministic behaviour based on the
  // unspecified order of a PHI nodes' users-iterator.
  WI.IsSigned |= IsSigned;
}

//===----------------------------------------------------------------------===//
//  Live IV Reduction - Minimize IVs live across the loop.
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
//  Simplification of IV users based on SCEV evaluation.
//===----------------------------------------------------------------------===//

namespace {

/// IVVisitor implementation that records, for one narrow IV phi, the widest
/// sext/zext user seen (via visitIVCast) in its public WI member.
class IndVarSimplifyVisitor : public IVVisitor {
  ScalarEvolution *SE;
  const TargetTransformInfo *TTI;
  PHINode *IVPhi;

public:
  // Widening information accumulated while visiting casts of IVPhi.
  WideIVInfo WI;

  IndVarSimplifyVisitor(PHINode *IV, ScalarEvolution *SCEV,
                        const TargetTransformInfo *TTI,
                        const DominatorTree *DTree)
    : SE(SCEV), TTI(TTI), IVPhi(IV) {
    // DT is not declared in this class; presumably a member inherited from
    // IVVisitor.
    DT = DTree;
    WI.NarrowIV = IVPhi;
  }

  // Implement the interface used by simplifyUsersOfIV.
  void visitCast(CastInst *Cast) override { visitIVCast(Cast, WI, SE, TTI); }
};

} // end anonymous namespace

/// Iteratively perform simplification on a worklist of IV users. Each
/// successive simplification may push more users which may themselves be
/// candidates for simplification.
///
/// Sign/Zero extend elimination is interleaved with IV simplification.
///
/// \returns true if any user was simplified or any IV was widened.
bool IndVarSimplify::simplifyAndExtend(Loop *L,
                                       SCEVExpander &Rewriter,
                                       LoopInfo *LI) {
  SmallVector<WideIVInfo, 8> WideIVs;

  // HasGuards is true when the module declares the experimental_guard
  // intrinsic and that declaration has at least one use; it is forwarded to
  // createWideIV.
  auto *GuardDecl = L->getBlocks()[0]->getModule()->getFunction(
          Intrinsic::getName(Intrinsic::experimental_guard));
  bool HasGuards = GuardDecl && !GuardDecl->use_empty();

  SmallVector<PHINode *, 8> LoopPhis;
  for (PHINode &PN : L->getHeader()->phis())
    LoopPhis.push_back(&PN);

  // Each round of simplification iterates through the SimplifyIVUsers worklist
  // for all current phis, then determines whether any IVs can be
  // widened. Widening adds new phis to LoopPhis, inducing another round of
  // simplification on the wide IVs.
  bool Changed = false;
  while (!LoopPhis.empty()) {
    // Evaluate as many IV expressions as possible before widening any IVs. This
    // forces SCEV to set no-wrap flags before evaluating sign/zero
    // extension. The first time SCEV attempts to normalize sign/zero extension,
    // the result becomes final. So for the most predictable results, we delay
    // evaluation of sign/zero extend evaluation until needed, and avoid running
    // other SCEV based analysis prior to simplifyAndExtend.
    do {
      PHINode *CurrIV = LoopPhis.pop_back_val();

      // Information about sign/zero extensions of CurrIV.
      IndVarSimplifyVisitor Visitor(CurrIV, SE, TTI, DT);

      Changed |= simplifyUsersOfIV(CurrIV, SE, DT, LI, TTI, DeadInsts, Rewriter,
                                   &Visitor);

      if (Visitor.WI.WidestNativeType) {
        WideIVs.push_back(Visitor.WI);
      }
    } while(!LoopPhis.empty());

    // Continue if we disallowed widening.
    // LoopPhis is empty here, so this 'continue' ends the outer loop.
    if (!WidenIndVars)
      continue;

    for (; !WideIVs.empty(); WideIVs.pop_back()) {
      unsigned ElimExt;
      unsigned Widened;
      if (PHINode *WidePhi = createWideIV(WideIVs.back(), LI, SE, Rewriter,
                                          DT, DeadInsts, ElimExt, Widened,
                                          HasGuards, UsePostIncrementRanges)) {
        NumElimExt += ElimExt;
        NumWidened += Widened;
        Changed = true;
        LoopPhis.push_back(WidePhi);
      }
    }
  }
  return Changed;
}

//===----------------------------------------------------------------------===//
//  linearFunctionTestReplace and its kin. Rewrite the loop exit condition.
//===----------------------------------------------------------------------===//

/// Given a Value which is hoped to be part of an add recurrence in the given
/// loop, return the associated Phi node if so.  Otherwise, return null.  Note
/// that this is less general than SCEVs AddRec checking.
///
/// Accepted shapes are an add/sub, or a two-operand getelementptr, in which
/// one operand is a phi in the header of \p L and the other operand is loop
/// invariant. For getelementptr the phi must be operand 0.
static PHINode *getLoopPhiForCounter(Value *IncV, Loop *L) {
  Instruction *IncI = dyn_cast<Instruction>(IncV);
  if (!IncI)
    return nullptr;

  switch (IncI->getOpcode()) {
  case Instruction::Add:
  case Instruction::Sub:
    break;
  case Instruction::GetElementPtr:
    // An IV counter must preserve its type.
    if (IncI->getNumOperands() == 2)
      break;
    [[fallthrough]];
  default:
    return nullptr;
  }

  PHINode *Phi = dyn_cast<PHINode>(IncI->getOperand(0));
  if (Phi && Phi->getParent() == L->getHeader()) {
    if (L->isLoopInvariant(IncI->getOperand(1)))
      return Phi;
    return nullptr;
  }
  // A getelementptr is not commutative: the phi has to be the base operand.
  if (IncI->getOpcode() == Instruction::GetElementPtr)
    return nullptr;

  // Allow add/sub to be commuted.
  Phi = dyn_cast<PHINode>(IncI->getOperand(1));
  if (Phi && Phi->getParent() == L->getHeader()) {
    if (L->isLoopInvariant(IncI->getOperand(0)))
      return Phi;
  }
  return nullptr;
}

/// Whether the current loop exit test is based on this value.  Currently this
/// is limited to a direct use in the loop condition.
///
/// The terminator of \p ExitingBB is cast<> to BranchInst, so it must be a
/// branch.
static bool isLoopExitTestBasedOn(Value *V, BasicBlock *ExitingBB) {
  BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator());
  ICmpInst *ICmp = dyn_cast<ICmpInst>(BI->getCondition());
  // TODO: Allow non-icmp loop test.
  if (!ICmp)
    return false;

  // TODO: Allow indirect use.
  return ICmp->getOperand(0) == V || ICmp->getOperand(1) == V;
}

/// linearFunctionTestReplace policy. Return true unless we can show that the
/// current exit test is already sufficiently canonical.
///
/// As coded below, "canonical" means an eq/ne icmp between a loop-invariant
/// value and a simple counter (a header phi, or its increment as recognized by
/// getLoopPhiForCounter). A loop-invariant condition also returns false.
static bool needsLFTR(Loop *L, BasicBlock *ExitingBB) {
  assert(L->getLoopLatch() && "Must be in simplified form");

  // Avoid converting a constant or loop invariant test back to a runtime
  // test.  This is critical for when SCEV's cached ExitCount is less precise
  // than the current IR (such as after we've proven a particular exit is
  // actually dead and thus the BE count never reaches our ExitCount.)
  BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator());
  if (L->isLoopInvariant(BI->getCondition()))
    return false;

  // Do LFTR to simplify the exit condition to an ICMP.
  ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
  if (!Cond)
    return true;

  // Do LFTR to simplify the exit ICMP to EQ/NE
  ICmpInst::Predicate Pred = Cond->getPredicate();
  if (Pred != ICmpInst::ICMP_NE && Pred != ICmpInst::ICMP_EQ)
    return true;

  // Look for a loop invariant RHS
  Value *LHS = Cond->getOperand(0);
  Value *RHS = Cond->getOperand(1);
  if (!L->isLoopInvariant(RHS)) {
    if (!L->isLoopInvariant(LHS))
      return true;
    std::swap(LHS, RHS);
  }
  // Look for a simple IV counter LHS
  PHINode *Phi = dyn_cast<PHINode>(LHS);
  if (!Phi)
    Phi = getLoopPhiForCounter(LHS, L);

  if (!Phi)
    return true;

  // Do LFTR if PHI node is defined in the loop, but is *not* a counter.
  int Idx = Phi->getBasicBlockIndex(L->getLoopLatch());
  if (Idx < 0)
    return true;

  // Do LFTR if the exit condition's IV is *not* a simple counter.
  Value *IncV = Phi->getIncomingValue(Idx);
  return Phi != getLoopPhiForCounter(IncV, L);
}

/// Recursive helper for hasConcreteDef(). Unfortunately, this currently boils
/// down to checking that all operands are constant and listing instructions
/// that may hide undef.
///
/// \param Visited  Values already queued for inspection; an operand found in
///                 the set is skipped rather than re-examined.
/// \param Depth    Current recursion depth; the search gives up (returns
///                 false) at depth 6.
static bool hasConcreteDefImpl(Value *V, SmallPtrSetImpl<Value*> &Visited,
                               unsigned Depth) {
  if (isa<Constant>(V))
    return !isa<UndefValue>(V);

  if (Depth >= 6)
    return false;

  // Conservatively handle non-constant non-instructions. For example, Arguments
  // may be undef.
  Instruction *I = dyn_cast<Instruction>(V);
  if (!I)
    return false;

  // Load and return values may be undef.
  if(I->mayReadFromMemory() || isa<CallInst>(I) || isa<InvokeInst>(I))
    return false;

  // Optimistically handle other instructions.
  for (Value *Op : I->operands()) {
    if (!Visited.insert(Op).second)
      continue;
    if (!hasConcreteDefImpl(Op, Visited, Depth+1))
      return false;
  }
  return true;
}

/// Return true if the given value is concrete. We must prove that undef can
/// never reach it.
///
/// TODO: If we decide that this is a good approach to checking for undef, we
/// may factor it into a common location.
static bool hasConcreteDef(Value *V) {
  SmallPtrSet<Value*, 8> Visited;
  // Seed the set with V itself so a cycle back to V is not walked again.
  Visited.insert(V);
  return hasConcreteDefImpl(V, Visited, 0);
}

/// Return true if the given phi is a "counter" in L.  A counter is an
/// add recurrence (of integer or pointer type) with an arbitrary start, and a
/// step of 1.  Note that L must have exactly one latch.
static bool isLoopCounter(PHINode* Phi, Loop *L,
                          ScalarEvolution *SE) {
  assert(Phi->getParent() == L->getHeader());
  assert(L->getLoopLatch());

  if (!SE->isSCEVable(Phi->getType()))
    return false;

  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Phi));
  if (!AR || AR->getLoop() != L || !AR->isAffine())
    return false;

  const SCEV *Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE));
  if (!Step || !Step->isOne())
    return false;

  // Besides the SCEV shape, require the IR itself to look like a counter: the
  // latch value must be a recognizable increment of this very phi and must
  // itself be an add recurrence.
  int LatchIdx = Phi->getBasicBlockIndex(L->getLoopLatch());
  Value *IncV = Phi->getIncomingValue(LatchIdx);
  return (getLoopPhiForCounter(IncV, L) == Phi &&
          isa<SCEVAddRecExpr>(SE->getSCEV(IncV)));
}

/// Search the loop header for a loop counter (an add rec w/step of one)
/// suitable for use by LFTR.  If multiple counters are available, select the
/// "best" one based on profitability heuristics.
///
/// BECount may be an i8* pointer type. The pointer difference is already
/// valid count without scaling the address stride, so it remains a pointer
/// expression as far as SCEV is concerned.
824 static PHINode *FindLoopCounter(Loop *L, BasicBlock *ExitingBB, 825 const SCEV *BECount, 826 ScalarEvolution *SE, DominatorTree *DT) { 827 uint64_t BCWidth = SE->getTypeSizeInBits(BECount->getType()); 828 829 Value *Cond = cast<BranchInst>(ExitingBB->getTerminator())->getCondition(); 830 831 // Loop over all of the PHI nodes, looking for a simple counter. 832 PHINode *BestPhi = nullptr; 833 const SCEV *BestInit = nullptr; 834 BasicBlock *LatchBlock = L->getLoopLatch(); 835 assert(LatchBlock && "Must be in simplified form"); 836 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 837 838 for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) { 839 PHINode *Phi = cast<PHINode>(I); 840 if (!isLoopCounter(Phi, L, SE)) 841 continue; 842 843 const auto *AR = cast<SCEVAddRecExpr>(SE->getSCEV(Phi)); 844 845 // AR may be a pointer type, while BECount is an integer type. 846 // AR may be wider than BECount. With eq/ne tests overflow is immaterial. 847 // AR may not be a narrower type, or we may never exit. 848 uint64_t PhiWidth = SE->getTypeSizeInBits(AR->getType()); 849 if (PhiWidth < BCWidth || !DL.isLegalInteger(PhiWidth)) 850 continue; 851 852 // Avoid reusing a potentially undef value to compute other values that may 853 // have originally had a concrete definition. 854 if (!hasConcreteDef(Phi)) { 855 // We explicitly allow unknown phis as long as they are already used by 856 // the loop exit test. This is legal since performing LFTR could not 857 // increase the number of undef users. 858 Value *IncPhi = Phi->getIncomingValueForBlock(LatchBlock); 859 if (!isLoopExitTestBasedOn(Phi, ExitingBB) && 860 !isLoopExitTestBasedOn(IncPhi, ExitingBB)) 861 continue; 862 } 863 864 // Avoid introducing undefined behavior due to poison which didn't exist in 865 // the original program. (Annoyingly, the rules for poison and undef 866 // propagation are distinct, so this does NOT cover the undef case above.) 
867 // We have to ensure that we don't introduce UB by introducing a use on an 868 // iteration where said IV produces poison. Our strategy here differs for 869 // pointers and integer IVs. For integers, we strip and reinfer as needed, 870 // see code in linearFunctionTestReplace. For pointers, we restrict 871 // transforms as there is no good way to reinfer inbounds once lost. 872 if (!Phi->getType()->isIntegerTy() && 873 !mustExecuteUBIfPoisonOnPathTo(Phi, ExitingBB->getTerminator(), DT)) 874 continue; 875 876 const SCEV *Init = AR->getStart(); 877 878 if (BestPhi && !isAlmostDeadIV(BestPhi, LatchBlock, Cond)) { 879 // Don't force a live loop counter if another IV can be used. 880 if (isAlmostDeadIV(Phi, LatchBlock, Cond)) 881 continue; 882 883 // Prefer to count-from-zero. This is a more "canonical" counter form. It 884 // also prefers integer to pointer IVs. 885 if (BestInit->isZero() != Init->isZero()) { 886 if (BestInit->isZero()) 887 continue; 888 } 889 // If two IVs both count from zero or both count from nonzero then the 890 // narrower is likely a dead phi that has been widened. Use the wider phi 891 // to allow the other to be eliminated. 892 else if (PhiWidth <= SE->getTypeSizeInBits(BestPhi->getType())) 893 continue; 894 } 895 BestPhi = Phi; 896 BestInit = Init; 897 } 898 return BestPhi; 899 } 900 901 /// Insert an IR expression which computes the value held by the IV IndVar 902 /// (which must be an loop counter w/unit stride) after the backedge of loop L 903 /// is taken ExitCount times. 
904 static Value *genLoopLimit(PHINode *IndVar, BasicBlock *ExitingBB, 905 const SCEV *ExitCount, bool UsePostInc, Loop *L, 906 SCEVExpander &Rewriter, ScalarEvolution *SE) { 907 assert(isLoopCounter(IndVar, L, SE)); 908 assert(ExitCount->getType()->isIntegerTy() && "exit count must be integer"); 909 const SCEVAddRecExpr *AR = cast<SCEVAddRecExpr>(SE->getSCEV(IndVar)); 910 assert(AR->getStepRecurrence(*SE)->isOne() && "only handles unit stride"); 911 912 // For integer IVs, truncate the IV before computing the limit unless we 913 // know apriori that the limit must be a constant when evaluated in the 914 // bitwidth of the IV. We prefer (potentially) keeping a truncate of the 915 // IV in the loop over a (potentially) expensive expansion of the widened 916 // exit count add(zext(add)) expression. 917 if (IndVar->getType()->isIntegerTy() && 918 SE->getTypeSizeInBits(AR->getType()) > 919 SE->getTypeSizeInBits(ExitCount->getType())) { 920 const SCEV *IVInit = AR->getStart(); 921 if (!isa<SCEVConstant>(IVInit) || !isa<SCEVConstant>(ExitCount)) 922 AR = cast<SCEVAddRecExpr>(SE->getTruncateExpr(AR, ExitCount->getType())); 923 } 924 925 const SCEVAddRecExpr *ARBase = UsePostInc ? AR->getPostIncExpr(*SE) : AR; 926 const SCEV *IVLimit = ARBase->evaluateAtIteration(ExitCount, *SE); 927 assert(SE->isLoopInvariant(IVLimit, L) && 928 "Computed iteration count is not loop invariant!"); 929 return Rewriter.expandCodeFor(IVLimit, ARBase->getType(), 930 ExitingBB->getTerminator()); 931 } 932 933 /// This method rewrites the exit condition of the loop to be a canonical != 934 /// comparison against the incremented loop induction variable. This pass is 935 /// able to rewrite the exit tests of any loop where the SCEV analysis can 936 /// determine a loop-invariant trip count of the loop, which is actually a much 937 /// broader range than just linear tests. 
bool IndVarSimplify::
linearFunctionTestReplace(Loop *L, BasicBlock *ExitingBB,
                          const SCEV *ExitCount,
                          PHINode *IndVar, SCEVExpander &Rewriter) {
  assert(L->getLoopLatch() && "Loop no longer in simplified form?");
  assert(isLoopCounter(IndVar, L, SE));
  // The value IndVar receives along the backedge, i.e. the increment.
  Instruction * const IncVar =
    cast<Instruction>(IndVar->getIncomingValueForBlock(L->getLoopLatch()));

  // Initialize CmpIndVar to the preincremented IV.
  Value *CmpIndVar = IndVar;
  bool UsePostInc = false;

  // If the exiting block is the same as the backedge block, we prefer to
  // compare against the post-incremented value, otherwise we must compare
  // against the preincremented value.
  if (ExitingBB == L->getLoopLatch()) {
    // For pointer IVs, we chose to not strip inbounds which requires us not
    // to add a potentially UB introducing use.  We need to either a) show
    // the loop test we're modifying is already in post-inc form, or b) show
    // that adding a use must not introduce UB.
    bool SafeToPostInc =
      IndVar->getType()->isIntegerTy() ||
      isLoopExitTestBasedOn(IncVar, ExitingBB) ||
      mustExecuteUBIfPoisonOnPathTo(IncVar, ExitingBB->getTerminator(), DT);
    if (SafeToPostInc) {
      UsePostInc = true;
      CmpIndVar = IncVar;
    }
  }

  // It may be necessary to drop nowrap flags on the incrementing instruction
  // if either LFTR moves from a pre-inc check to a post-inc check (in which
  // case the increment might have previously been poison on the last iteration
  // only) or if LFTR switches to a different IV that was previously dynamically
  // dead (and as such may be arbitrarily poison). We remove any nowrap flags
  // that SCEV didn't infer for the post-inc addrec (even if we use a pre-inc
  // check), because the pre-inc addrec flags may be adopted from the original
  // instruction, while SCEV has to explicitly prove the post-inc nowrap flags.
  // TODO: This handling is inaccurate for one case: If we switch to a
  // dynamically dead IV that wraps on the first loop iteration only, which is
  // not covered by the post-inc addrec. (If the new IV was not dynamically
  // dead, it could not be poison on the first iteration in the first place.)
  if (auto *BO = dyn_cast<BinaryOperator>(IncVar)) {
    const SCEVAddRecExpr *AR = cast<SCEVAddRecExpr>(SE->getSCEV(IncVar));
    // Flags are only ever cleared here: an existing flag is kept only when
    // the add recurrence carries it as well.
    if (BO->hasNoUnsignedWrap())
      BO->setHasNoUnsignedWrap(AR->hasNoUnsignedWrap());
    if (BO->hasNoSignedWrap())
      BO->setHasNoSignedWrap(AR->hasNoSignedWrap());
  }

  Value *ExitCnt = genLoopLimit(
      IndVar, ExitingBB, ExitCount, UsePostInc, L, Rewriter, SE);
  assert(ExitCnt->getType()->isPointerTy() ==
             IndVar->getType()->isPointerTy() &&
         "genLoopLimit missed a cast");

  // Insert a new icmp_ne or icmp_eq instruction before the branch.
  BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator());
  ICmpInst::Predicate P;
  // If the true successor stays in the loop, the loop continues while
  // IV != limit; otherwise the branch exits when IV == limit.
  if (L->contains(BI->getSuccessor(0)))
    P = ICmpInst::ICMP_NE;
  else
    P = ICmpInst::ICMP_EQ;

  IRBuilder<> Builder(BI);

  // The new loop exit condition should reuse the debug location of the
  // original loop exit condition.
  if (auto *Cond = dyn_cast<Instruction>(BI->getCondition()))
    Builder.SetCurrentDebugLocation(Cond->getDebugLoc());

  // For integer IVs, if we evaluated the limit in the narrower bitwidth to
  // avoid the expensive expansion of the limit expression in the wider type,
  // emit a truncate to narrow the IV to the ExitCount type.  This is safe
  // since we know (from the exit count bitwidth), that we can't self-wrap in
  // the narrower type.
  unsigned CmpIndVarSize = SE->getTypeSizeInBits(CmpIndVar->getType());
  unsigned ExitCntSize = SE->getTypeSizeInBits(ExitCnt->getType());
  if (CmpIndVarSize > ExitCntSize) {
    assert(!CmpIndVar->getType()->isPointerTy() &&
           !ExitCnt->getType()->isPointerTy());

    // Before resorting to actually inserting the truncate, use the same
    // reasoning as from SimplifyIndvar::eliminateTrunc to see if we can extend
    // the other side of the comparison instead.  We still evaluate the limit
    // in the narrower bitwidth, we just prefer a zext/sext outside the loop to
    // a truncate within it.
    bool Extended = false;
    const SCEV *IV = SE->getSCEV(CmpIndVar);
    const SCEV *TruncatedIV = SE->getTruncateExpr(IV, ExitCnt->getType());
    const SCEV *ZExtTrunc =
      SE->getZeroExtendExpr(TruncatedIV, CmpIndVar->getType());

    // zext(trunc(IV)) == IV: zero-extending the limit instead is equivalent.
    if (ZExtTrunc == IV) {
      Extended = true;
      ExitCnt = Builder.CreateZExt(ExitCnt, IndVar->getType(),
                                   "wide.trip.count");
    } else {
      const SCEV *SExtTrunc =
        SE->getSignExtendExpr(TruncatedIV, CmpIndVar->getType());
      // sext(trunc(IV)) == IV: sign-extending the limit instead is equivalent.
      if (SExtTrunc == IV) {
        Extended = true;
        ExitCnt = Builder.CreateSExt(ExitCnt, IndVar->getType(),
                                     "wide.trip.count");
      }
    }

    if (Extended) {
      // Ask the loop to make the extended limit loop invariant; the
      // "changed" out-parameter is not needed here.
      bool Discard;
      L->makeLoopInvariant(ExitCnt, Discard);
    } else
      CmpIndVar = Builder.CreateTrunc(CmpIndVar, ExitCnt->getType(),
                                      "lftr.wideiv");
  }
  LLVM_DEBUG(dbgs() << "INDVARS: Rewriting loop exit condition to:\n"
                    << " LHS:" << *CmpIndVar << '\n'
                    << " op:\t" << (P == ICmpInst::ICMP_NE ? "!=" : "==")
                    << "\n"
                    << " RHS:\t" << *ExitCnt << "\n"
                    << "ExitCount:\t" << *ExitCount << "\n"
                    << " was: " << *BI->getCondition() << "\n");

  Value *Cond = Builder.CreateICmp(P, CmpIndVar, ExitCnt, "exitcond");
  Value *OrigCond = BI->getCondition();
  // It's tempting to use replaceAllUsesWith here to fully replace the old
  // comparison, but that's not immediately safe, since users of the old
  // comparison may not be dominated by the new comparison.  Instead, just
  // update the branch to use the new comparison; in the common case this
  // will make old comparison dead.
  BI->setCondition(Cond);
  DeadInsts.emplace_back(OrigCond);

  ++NumLFTR;
  return true;
}

//===----------------------------------------------------------------------===//
//  sinkUnusedInvariants. A late subpass to cleanup loop preheaders.
//===----------------------------------------------------------------------===//

/// If there's a single exit block, sink any loop-invariant values that
/// were defined in the preheader but not used inside the loop into the
/// exit block to reduce register pressure in the loop.
///
/// \returns true if at least one instruction was moved.
bool IndVarSimplify::sinkUnusedInvariants(Loop *L) {
  BasicBlock *ExitBlock = L->getExitBlock();
  if (!ExitBlock) return false;

  BasicBlock *Preheader = L->getLoopPreheader();
  if (!Preheader) return false;

  bool MadeAnyChanges = false;
  BasicBlock::iterator InsertPt = ExitBlock->getFirstInsertionPt();
  // Walk the preheader backwards, starting just above its terminator.
  BasicBlock::iterator I(Preheader->getTerminator());
  while (I != Preheader->begin()) {
    --I;
    // New instructions were inserted at the end of the preheader.
    if (isa<PHINode>(I))
      break;

    // Don't move instructions which might have side effects, since the side
    // effects need to complete before instructions inside the loop.  Also don't
    // move instructions which might read memory, since the loop may modify
    // memory. Note that it's okay if the instruction might have undefined
    // behavior: LoopSimplify guarantees that the preheader dominates the exit
    // block.
    if (I->mayHaveSideEffects() || I->mayReadFromMemory())
      continue;

    // Skip debug info intrinsics.
    if (isa<DbgInfoIntrinsic>(I))
      continue;

    // Skip eh pad instructions.
    if (I->isEHPad())
      continue;

    // Don't sink alloca: we never want to sink static alloca's out of the
    // entry block, and correctly sinking dynamic alloca's requires
    // checks for stacksave/stackrestore intrinsics.
    // FIXME: Refactor this check somehow?
    if (isa<AllocaInst>(I))
      continue;

    // Determine if there is a use in or before the loop (direct or
    // otherwise).
    bool UsedInLoop = false;
    for (Use &U : I->uses()) {
      Instruction *User = cast<Instruction>(U.getUser());
      BasicBlock *UseBB = User->getParent();
      // For a phi user, the use is attributed to the incoming block of the
      // corresponding operand rather than to the phi's own block.
      if (PHINode *P = dyn_cast<PHINode>(User)) {
        unsigned i =
          PHINode::getIncomingValueNumForOperand(U.getOperandNo());
        UseBB = P->getIncomingBlock(i);
      }
      if (UseBB == Preheader || L->contains(UseBB)) {
        UsedInLoop = true;
        break;
      }
    }

    // If there is, the def must remain in the preheader.
    if (UsedInLoop)
      continue;

    // Otherwise, sink it to the exit block.
    Instruction *ToMove = &*I;
    bool Done = false;

    // Step the iterator off ToMove before it is moved to another block.
    if (I != Preheader->begin()) {
      // Skip debug info intrinsics.
      do {
        --I;
      } while (I->isDebugOrPseudoInst() && I != Preheader->begin());

      if (I->isDebugOrPseudoInst() && I == Preheader->begin())
        Done = true;
    } else {
      Done = true;
    }

    MadeAnyChanges = true;
    ToMove->moveBefore(*ExitBlock, InsertPt);
    SE->forgetValue(ToMove);
    if (Done) break;
    // Later (earlier-in-preheader) instructions are inserted before this one.
    InsertPt = ToMove->getIterator();
  }

  return MadeAnyChanges;
}

/// Make \p BI branch on \p NewCond. If the previous condition has no
/// remaining uses afterwards, it is appended to \p DeadInsts.
static void replaceExitCond(BranchInst *BI, Value *NewCond,
                            SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
  auto *OldCond = BI->getCondition();
  LLVM_DEBUG(dbgs() << "Replacing condition of loop-exiting branch " << *BI
                    << " with " << *NewCond << "\n");
  BI->setCondition(NewCond);
  if (OldCond->use_empty())
    DeadInsts.emplace_back(OldCond);
}

/// Build the constant condition that forces the exit in \p ExitingBB to be
/// taken (\p IsTaken == true) or not taken (\p IsTaken == false). The constant
/// has the type of the branch's current condition.
static Constant *createFoldedExitCond(const Loop *L, BasicBlock *ExitingBB,
                                      bool IsTaken) {
  BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator());
  // The branch exits on "true" when its first successor is outside the loop.
  bool ExitIfTrue = !L->contains(*succ_begin(ExitingBB));
  auto *OldCond = BI->getCondition();
  return ConstantInt::get(OldCond->getType(),
                          IsTaken ? ExitIfTrue : !ExitIfTrue);
}

/// Replace the condition of the exiting branch in \p ExitingBB with the
/// constant produced by createFoldedExitCond() for \p IsTaken.
static void foldExit(const Loop *L, BasicBlock *ExitingBB, bool IsTaken,
                     SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
  BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator());
  auto *NewCond = createFoldedExitCond(L, ExitingBB, IsTaken);
  replaceExitCond(BI, NewCond, DeadInsts);
}

/// Replace every use of each phi in the header of \p L with the phi's incoming
/// value from the preheader, queue the phis in \p DeadInsts, and then try to
/// simplify the (transitive) in-loop users of the replaced values.
static void replaceLoopPHINodesWithPreheaderValues(
    LoopInfo *LI, Loop *L, SmallVectorImpl<WeakTrackingVH> &DeadInsts,
    ScalarEvolution &SE) {
  assert(L->isLoopSimplifyForm() && "Should only do it in simplify form!");
  auto *LoopPreheader = L->getLoopPreheader();
  auto *LoopHeader = L->getHeader();
  SmallVector<Instruction *> Worklist;
  for (auto &PN : LoopHeader->phis()) {
    auto *PreheaderIncoming = PN.getIncomingValueForBlock(LoopPreheader);
    // Remember the users before the RAUW below so they can be revisited.
    for (User *U : PN.users())
      Worklist.push_back(cast<Instruction>(U));
    SE.forgetValue(&PN);
    PN.replaceAllUsesWith(PreheaderIncoming);
    DeadInsts.emplace_back(&PN);
  }

  // Replacing with the preheader value will often allow IV users to simplify
  // (especially if the preheader value is a constant).
  SmallPtrSet<Instruction *, 16> Visited;
  while (!Worklist.empty()) {
    auto *I = cast<Instruction>(Worklist.pop_back_val());
    if (!Visited.insert(I).second)
      continue;

    // Don't simplify instructions outside the loop.
    if (!L->contains(I))
      continue;

    Value *Res = simplifyInstruction(I, I->getModule()->getDataLayout());
    // Only apply a simplification that replacementPreservesLCSSAForm accepts.
    if (Res && LI->replacementPreservesLCSSAForm(I, Res)) {
      for (User *U : I->users())
        Worklist.push_back(cast<Instruction>(U));
      I->replaceAllUsesWith(Res);
      DeadInsts.emplace_back(I);
    }
  }
}

/// Expand the operands of the loop-invariant predicate \p LIP in the preheader
/// of \p L and create an icmp of them there. The predicate is inverted when
/// the exiting branch exits on "true". The new compare takes the name of the
/// branch's current condition.
static Value *
createInvariantCond(const Loop *L, BasicBlock *ExitingBB,
                    const ScalarEvolution::LoopInvariantPredicate &LIP,
                    SCEVExpander &Rewriter) {
  ICmpInst::Predicate InvariantPred = LIP.Pred;
  BasicBlock *Preheader = L->getLoopPreheader();
  assert(Preheader && "Preheader doesn't exist");
  Rewriter.setInsertPoint(Preheader->getTerminator());
  auto *LHSV = Rewriter.expandCodeFor(LIP.LHS);
  auto *RHSV = Rewriter.expandCodeFor(LIP.RHS);
  bool ExitIfTrue = !L->contains(*succ_begin(ExitingBB));
  if (ExitIfTrue)
    InvariantPred = ICmpInst::getInversePredicate(InvariantPred);
  IRBuilder<> Builder(Preheader->getTerminator());
  BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator());
  return Builder.CreateICmp(InvariantPred, LHSV, RHSV,
                            BI->getCondition()->getName());
}

/// Try to find a replacement for the leaf exit condition \p ICmp.
///
/// \param MaxIter       Bound on the number of iterations to reason about; it
///                      is adjusted to the compare's operand type when
///                      possible and decremented when \p SkipLastIter is set.
/// \param Inverted      If true, the predicate of \p ICmp is inverted first so
///                      that "LHS pred RHS" means "stay in the loop".
/// \param SkipLastIter  If true, reason about one iteration fewer.
/// \returns a constant condition when the predicate can be decided outright,
///          a compare built in the preheader when a loop-invariant equivalent
///          exists, or std::nullopt when no replacement was found.
static std::optional<Value *>
createReplacement(ICmpInst *ICmp, const Loop *L, BasicBlock *ExitingBB,
                  const SCEV *MaxIter, bool Inverted, bool SkipLastIter,
                  ScalarEvolution *SE, SCEVExpander &Rewriter) {
  ICmpInst::Predicate Pred = ICmp->getPredicate();
  Value *LHS = ICmp->getOperand(0);
  Value *RHS = ICmp->getOperand(1);

  // 'LHS pred RHS' should now mean that we stay in loop.
  auto *BI = cast<BranchInst>(ExitingBB->getTerminator());
  if (Inverted)
    Pred = CmpInst::getInversePredicate(Pred);

  const SCEV *LHSS = SE->getSCEVAtScope(LHS, L);
  const SCEV *RHSS = SE->getSCEVAtScope(RHS, L);
  // Can we prove it to be trivially true or false?
  if (auto EV = SE->evaluatePredicateAt(Pred, LHSS, RHSS, BI))
    return createFoldedExitCond(L, ExitingBB, /*IsTaken*/ !*EV);

  auto *ARTy = LHSS->getType();
  auto *MaxIterTy = MaxIter->getType();
  // If possible, adjust types.
  if (SE->getTypeSizeInBits(ARTy) > SE->getTypeSizeInBits(MaxIterTy))
    MaxIter = SE->getZeroExtendExpr(MaxIter, ARTy);
  else if (SE->getTypeSizeInBits(ARTy) < SE->getTypeSizeInBits(MaxIterTy)) {
    // Only truncate MaxIter when it is known to fit into the narrower type.
    const SCEV *MinusOne = SE->getMinusOne(ARTy);
    auto *MaxAllowedIter = SE->getZeroExtendExpr(MinusOne, MaxIterTy);
    if (SE->isKnownPredicateAt(ICmpInst::ICMP_ULE, MaxIter, MaxAllowedIter, BI))
      MaxIter = SE->getTruncateExpr(MaxIter, ARTy);
  }

  if (SkipLastIter) {
    // Semantically skip last iter is "subtract 1, do not bother about unsigned
    // wrap". getLoopInvariantExitCondDuringFirstIterations knows how to deal
    // with umin in a smart way, but umin(a, b) - 1 will likely not simplify.
    // So we manually construct umin(a - 1, b - 1).
    SmallVector<const SCEV *, 4> Elements;
    if (auto *UMin = dyn_cast<SCEVUMinExpr>(MaxIter)) {
      for (auto *Op : UMin->operands())
        Elements.push_back(SE->getMinusSCEV(Op, SE->getOne(Op->getType())));
      MaxIter = SE->getUMinFromMismatchedTypes(Elements);
    } else
      MaxIter = SE->getMinusSCEV(MaxIter, SE->getOne(MaxIter->getType()));
  }

  // Check if there is a loop-invariant predicate equivalent to our check.
  auto LIP = SE->getLoopInvariantExitCondDuringFirstIterations(Pred, LHSS, RHSS,
                                                               L, BI, MaxIter);
  if (!LIP)
    return std::nullopt;

  // Can we prove it to be trivially true?
  if (SE->isKnownPredicateAt(LIP->Pred, LIP->LHS, LIP->RHS, BI))
    return createFoldedExitCond(L, ExitingBB, /*IsTaken*/ false);
  else
    return createInvariantCond(L, ExitingBB, *LIP, Rewriter);
}

/// Try to simplify the leaf icmps that make up the condition of exiting branch
/// \p BI when the exit's exact exit count is not known.
///
/// The condition is decomposed through logical AND (or logical OR when the
/// loop is entered on the false edge) into single-use leaf icmps, and each
/// leaf is offered to createReplacement(). Replaced leaves are appended to
/// \p DeadInsts.
///
/// \returns true if at least one leaf condition was replaced.
static bool optimizeLoopExitWithUnknownExitCount(
    const Loop *L, BranchInst *BI, BasicBlock *ExitingBB, const SCEV *MaxIter,
    bool SkipLastIter, ScalarEvolution *SE, SCEVExpander &Rewriter,
    SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
  assert(
      (L->contains(BI->getSuccessor(0)) != L->contains(BI->getSuccessor(1))) &&
      "Not a loop exit!");

  // For branch that stays in loop by TRUE condition, go through AND. For branch
  // that stays in loop by FALSE condition, go through OR. Both gives the
  // similar logic: "stay in loop iff all conditions are true(false)".
  bool Inverted = L->contains(BI->getSuccessor(1));
  SmallVector<ICmpInst *, 4> LeafConditions;
  SmallVector<Value *, 4> Worklist;
  SmallPtrSet<Value *, 4> Visited;
  Value *OldCond = BI->getCondition();
  Visited.insert(OldCond);
  Worklist.push_back(OldCond);

  // If V is a logical AND/OR of the expected kind, queue its not-yet-visited
  // operands and return true; otherwise return false.
  auto GoThrough = [&](Value *V) {
    Value *LHS = nullptr, *RHS = nullptr;
    if (Inverted) {
      if (!match(V, m_LogicalOr(m_Value(LHS), m_Value(RHS))))
        return false;
    } else {
      if (!match(V, m_LogicalAnd(m_Value(LHS), m_Value(RHS))))
        return false;
    }
    if (Visited.insert(LHS).second)
      Worklist.push_back(LHS);
    if (Visited.insert(RHS).second)
      Worklist.push_back(RHS);
    return true;
  };

  do {
    Value *Curr = Worklist.pop_back_val();
    // Go through AND/OR conditions. Collect leaf ICMPs. We only care about
    // those with one use, to avoid instruction duplication.
    if (Curr->hasOneUse())
      if (!GoThrough(Curr))
        if (auto *ICmp = dyn_cast<ICmpInst>(Curr))
          LeafConditions.push_back(ICmp);
  } while (!Worklist.empty());

  // If the current basic block has the same exit count as the whole loop, and
  // it consists of multiple icmp's, try to collect all icmp's that give exact
  // same exit count. For all other icmp's, we could use one less iteration,
  // because their value on the last iteration doesn't really matter.
  SmallPtrSet<ICmpInst *, 4> ICmpsFailingOnLastIter;
  if (!SkipLastIter && LeafConditions.size() > 1 &&
      SE->getExitCount(L, ExitingBB,
                       ScalarEvolution::ExitCountKind::SymbolicMaximum) ==
          MaxIter)
    for (auto *ICmp : LeafConditions) {
      auto EL = SE->computeExitLimitFromCond(L, ICmp, Inverted,
                                             /*ControlsExit*/ false);
      auto *ExitMax = EL.SymbolicMaxNotTaken;
      if (isa<SCEVCouldNotCompute>(ExitMax))
        continue;
      // They could be of different types (specifically this happens after
      // IV widening).
      auto *WiderType =
          SE->getWiderType(ExitMax->getType(), MaxIter->getType());
      auto *WideExitMax = SE->getNoopOrZeroExtend(ExitMax, WiderType);
      auto *WideMaxIter = SE->getNoopOrZeroExtend(MaxIter, WiderType);
      if (WideExitMax == WideMaxIter)
        ICmpsFailingOnLastIter.insert(ICmp);
    }

  bool Changed = false;
  // Note: the loop variable below shadows the outer OldCond (the whole branch
  // condition); inside the loop OldCond names the current leaf icmp.
  for (auto *OldCond : LeafConditions) {
    // Skip last iteration for this icmp under one of two conditions:
    // - We do it for all conditions;
    // - There is another ICmp that would fail on last iter, so this one doesn't
    // really matter.
    bool OptimisticSkipLastIter = SkipLastIter;
    if (!OptimisticSkipLastIter) {
      if (ICmpsFailingOnLastIter.size() > 1)
        OptimisticSkipLastIter = true;
      else if (ICmpsFailingOnLastIter.size() == 1)
        OptimisticSkipLastIter = !ICmpsFailingOnLastIter.count(OldCond);
    }
    if (auto Replaced =
            createReplacement(OldCond, L, ExitingBB, MaxIter, Inverted,
                              OptimisticSkipLastIter, SE, Rewriter)) {
      Changed = true;
      auto *NewCond = *Replaced;
      if (auto *NCI = dyn_cast<Instruction>(NewCond)) {
        NCI->setName(OldCond->getName() + ".first_iter");
      }
      LLVM_DEBUG(dbgs() << "Unknown exit count: Replacing " << *OldCond
                        << " with " << *NewCond << "\n");
      assert(OldCond->hasOneUse() && "Must be!");
      OldCond->replaceAllUsesWith(NewCond);
      DeadInsts.push_back(OldCond);
      // Make sure we no longer consider this condition as failing on last
      // iteration.
      ICmpsFailingOnLastIter.erase(OldCond);
    }
  }
  return Changed;
}

/// Canonicalize exit compares of the form icmp(zext(X), invariant) in \p L.
///
/// First pass: a signed compare against a zext is switched to the unsigned
/// predicate when the unsigned range of the invariant operand fits in the
/// zero-extended inner type. Second pass: for such unsigned compares the zext
/// is removed from the loop by truncating the invariant operand in the
/// preheader instead.
///
/// \returns true if any compare was changed.
bool IndVarSimplify::canonicalizeExitCondition(Loop *L) {
  // Note: This is duplicating a particular part on SimplifyIndVars reasoning.
  // We need to duplicate it because given icmp zext(small-iv), C, IVUsers
  // never reaches the icmp since the zext doesn't fold to an AddRec unless
  // it already has flags.  The alternative to this would be to extending the
  // set of "interesting" IV users to include the icmp, but doing that
  // regresses results in practice by querying SCEVs before trip counts which
  // rely on them which results in SCEV caching sub-optimal answers.  The
  // concern about caching sub-optimal results is why we only query SCEVs of
  // the loop invariant RHS here.
  SmallVector<BasicBlock*, 16> ExitingBlocks;
  L->getExitingBlocks(ExitingBlocks);
  bool Changed = false;
  for (auto *ExitingBB : ExitingBlocks) {
    auto *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator());
    if (!BI)
      continue;
    assert(BI->isConditional() && "exit branch must be conditional");

    auto *ICmp = dyn_cast<ICmpInst>(BI->getCondition());
    if (!ICmp || !ICmp->hasOneUse())
      continue;

    auto *LHS = ICmp->getOperand(0);
    auto *RHS = ICmp->getOperand(1);
    // For the range reasoning, avoid computing SCEVs in the loop to avoid
    // poisoning cache with sub-optimal results.  For the must-execute case,
    // this is a necessary precondition for correctness.
    if (!L->isLoopInvariant(RHS)) {
      if (!L->isLoopInvariant(LHS))
        continue;
      // Same logic applies for the inverse case
      std::swap(LHS, RHS);
    }

    // Match (icmp signed-cond zext, RHS)
    Value *LHSOp = nullptr;
    if (!match(LHS, m_ZExt(m_Value(LHSOp))) || !ICmp->isSigned())
      continue;

    const DataLayout &DL = ExitingBB->getModule()->getDataLayout();
    const unsigned InnerBitWidth = DL.getTypeSizeInBits(LHSOp->getType());
    const unsigned OuterBitWidth = DL.getTypeSizeInBits(RHS->getType());
    // All values representable by the zext source, expressed in the wide type.
    auto FullCR = ConstantRange::getFull(InnerBitWidth);
    FullCR = FullCR.zeroExtend(OuterBitWidth);
    auto RHSCR = SE->getUnsignedRange(SE->applyLoopGuards(SE->getSCEV(RHS), L));
    if (FullCR.contains(RHSCR)) {
      // We have now matched icmp signed-cond zext(X), zext(Y'), and can thus
      // replace the signed condition with the unsigned version.
      ICmp->setPredicate(ICmp->getUnsignedPredicate());
      Changed = true;
      // Note: No SCEV invalidation needed.  We've changed the predicate, but
      // have not changed exit counts, or the values produced by the compare.
      continue;
    }
  }

  // Now that we've canonicalized the condition to match the extend,
  // see if we can rotate the extend out of the loop.
  for (auto *ExitingBB : ExitingBlocks) {
    auto *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator());
    if (!BI)
      continue;
    assert(BI->isConditional() && "exit branch must be conditional");

    auto *ICmp = dyn_cast<ICmpInst>(BI->getCondition());
    if (!ICmp || !ICmp->hasOneUse() || !ICmp->isUnsigned())
      continue;

    bool Swapped = false;
    auto *LHS = ICmp->getOperand(0);
    auto *RHS = ICmp->getOperand(1);
    if (L->isLoopInvariant(LHS) == L->isLoopInvariant(RHS))
      // Nothing to rotate
      continue;
    if (L->isLoopInvariant(LHS)) {
      // Same logic applies for the inverse case until we actually pick
      // which operand of the compare to update.
      Swapped = true;
      std::swap(LHS, RHS);
    }
    assert(!L->isLoopInvariant(LHS) && L->isLoopInvariant(RHS));

    // Match (icmp unsigned-cond zext, RHS)
    // TODO: Extend to handle corresponding sext/signed-cmp case
    // TODO: Extend to other invertible functions
    Value *LHSOp = nullptr;
    if (!match(LHS, m_ZExt(m_Value(LHSOp))))
      continue;

    // In general, we only rotate if we can do so without increasing the number
    // of instructions.  The exception is when we have a zext(add-rec).  The
    // reason for allowing this exception is that we know we need to get rid
    // of the zext for SCEV to be able to compute a trip count for said loops;
    // we consider the new trip count valuable enough to increase instruction
    // count by one.
    if (!LHS->hasOneUse() && !isa<SCEVAddRecExpr>(SE->getSCEV(LHSOp)))
      continue;

    // Given an icmp unsigned-cond zext(Op) where zext(trunc(RHS)) == RHS
    // replace with an icmp of the form icmp unsigned-cond Op, trunc(RHS)
    // when zext is loop varying and RHS is loop invariant.  This converts
    // loop varying work to loop-invariant work.
    auto doRotateTransform = [&]() {
      assert(ICmp->isUnsigned() && "must have proven unsigned already");
      auto *NewRHS =
        CastInst::Create(Instruction::Trunc, RHS, LHSOp->getType(), "",
                         L->getLoopPreheader()->getTerminator());
      // Put the operands back in their original positions if they were
      // swapped above.
      ICmp->setOperand(Swapped ? 1 : 0, LHSOp);
      ICmp->setOperand(Swapped ? 0 : 1, NewRHS);
      if (LHS->use_empty())
        DeadInsts.push_back(LHS);
    };


    const DataLayout &DL = ExitingBB->getModule()->getDataLayout();
    const unsigned InnerBitWidth = DL.getTypeSizeInBits(LHSOp->getType());
    const unsigned OuterBitWidth = DL.getTypeSizeInBits(RHS->getType());
    auto FullCR = ConstantRange::getFull(InnerBitWidth);
    FullCR = FullCR.zeroExtend(OuterBitWidth);
    auto RHSCR = SE->getUnsignedRange(SE->applyLoopGuards(SE->getSCEV(RHS), L));
    if (FullCR.contains(RHSCR)) {
      doRotateTransform();
      Changed = true;
      // Note, we are leaving SCEV in an unfortunately imprecise case here
      // as rotation tends to reveal information about trip counts not
      // previously visible.
      continue;
    }
  }

  return Changed;
}

bool IndVarSimplify::optimizeLoopExits(Loop *L, SCEVExpander &Rewriter) {
  SmallVector<BasicBlock*, 16> ExitingBlocks;
  L->getExitingBlocks(ExitingBlocks);

  // Remove all exits which aren't both rewriteable and execute on every
  // iteration.
  llvm::erase_if(ExitingBlocks, [&](BasicBlock *ExitingBB) {
    // If our exitting block exits multiple loops, we can only rewrite the
    // innermost one.  Otherwise, we're changing how many times the innermost
    // loop runs before it exits.
    if (LI->getLoopFor(ExitingBB) != L)
      return true;

    // Can't rewrite non-branch yet.
    BranchInst *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator());
    if (!BI)
      return true;

    // Likewise, the loop latch must be dominated by the exiting BB.
    if (!DT->dominates(ExitingBB, L->getLoopLatch()))
      return true;

    if (auto *CI = dyn_cast<ConstantInt>(BI->getCondition())) {
      // If already constant, nothing to do. However, if this is an
      // unconditional exit, we can still replace header phis with their
      // preheader value.
      if (!L->contains(BI->getSuccessor(CI->isNullValue())))
        replaceLoopPHINodesWithPreheaderValues(LI, L, DeadInsts, *SE);
      return true;
    }

    return false;
  });

  if (ExitingBlocks.empty())
    return false;

  // Get a symbolic upper bound on the loop backedge taken count.
  const SCEV *MaxBECount = SE->getSymbolicMaxBackedgeTakenCount(L);
  if (isa<SCEVCouldNotCompute>(MaxBECount))
    return false;

  // Visit our exit blocks in order of dominance. We know from the fact that
  // all exits must dominate the latch, so there is a total dominance order
  // between them.
  llvm::sort(ExitingBlocks, [&](BasicBlock *A, BasicBlock *B) {
    // std::sort sorts in ascending order, so we want the inverse of
    // the normal dominance relation.
    if (A == B) return false;
    if (DT->properlyDominates(A, B))
      return true;
    else {
      assert(DT->properlyDominates(B, A) &&
             "expected total dominance order!");
      return false;
    }
  });
  // NOTE(review): no definition of the ASSERT macro is visible in this part of
  // the file; if it is never defined, the ordering check below is compiled out
  // in every build. Possibly `#ifndef NDEBUG` was intended -- confirm.
#ifdef ASSERT
  for (unsigned i = 1; i < ExitingBlocks.size(); i++) {
    assert(DT->dominates(ExitingBlocks[i-1], ExitingBlocks[i]));
  }
#endif

  bool Changed = false;
  bool SkipLastIter = false;
  const SCEV *CurrMaxExit = SE->getCouldNotCompute();
  auto UpdateSkipLastIter = [&](const SCEV *MaxExitCount) {
    if (SkipLastIter || isa<SCEVCouldNotCompute>(MaxExitCount))
      return;
    if (isa<SCEVCouldNotCompute>(CurrMaxExit))
      CurrMaxExit = MaxExitCount;
    else
      CurrMaxExit = SE->getUMinFromMismatchedTypes(CurrMaxExit, MaxExitCount);
    // If the loop has more than 1 iteration, all further checks will be
    // executed 1 iteration less.
    if (CurrMaxExit == MaxBECount)
      SkipLastIter = true;
  };
  SmallSet<const SCEV *, 8> DominatingExactExitCounts;
  for (BasicBlock *ExitingBB : ExitingBlocks) {
    const SCEV *ExactExitCount = SE->getExitCount(L, ExitingBB);
    const SCEV *MaxExitCount = SE->getExitCount(
        L, ExitingBB, ScalarEvolution::ExitCountKind::SymbolicMaximum);
    if (isa<SCEVCouldNotCompute>(ExactExitCount)) {
      // Okay, we do not know the exit count here. Can we at least prove that it
      // will remain the same within iteration space?
      auto *BI = cast<BranchInst>(ExitingBB->getTerminator());
      auto OptimizeCond = [&](bool SkipLastIter) {
        return optimizeLoopExitWithUnknownExitCount(L, BI, ExitingBB,
                                                    MaxBECount, SkipLastIter,
                                                    SE, Rewriter, DeadInsts);
      };

      // TODO: We might have proved that we can skip the last iteration for
      // this check. In this case, we only want to check the condition on the
      // pre-last iteration (MaxBECount - 1). However, there is a nasty
      // corner case:
      //
      //   for (i = len; i != 0; i--) { ... check (i ult X) ... }
      //
      // If we could not prove that len != 0, then we also could not prove that
      // (len - 1) is not a UINT_MAX. If we simply query (len - 1), then
      // OptimizeCond will likely not prove anything for it, even if it could
      // prove the same fact for len.
      //
      // As a temporary solution, we query both last and pre-last iterations in
      // hope that we will be able to prove triviality for at least one of
      // them. We can stop querying MaxBECount for this case once SCEV
      // understands that (MaxBECount - 1) will not overflow here.
      if (OptimizeCond(false))
        Changed = true;
      else if (SkipLastIter && OptimizeCond(true))
        Changed = true;
      UpdateSkipLastIter(MaxExitCount);
      continue;
    }

    UpdateSkipLastIter(ExactExitCount);

    // If we know we'd exit on the first iteration, rewrite the exit to
    // reflect this.  This does not imply the loop must exit through this
    // exit; there may be an earlier one taken on the first iteration.
    // We know that the backedge can't be taken, so we replace all
    // the header PHIs with values coming from the preheader.
1672 if (ExactExitCount->isZero()) { 1673 foldExit(L, ExitingBB, true, DeadInsts); 1674 replaceLoopPHINodesWithPreheaderValues(LI, L, DeadInsts, *SE); 1675 Changed = true; 1676 continue; 1677 } 1678 1679 assert(ExactExitCount->getType()->isIntegerTy() && 1680 MaxBECount->getType()->isIntegerTy() && 1681 "Exit counts must be integers"); 1682 1683 Type *WiderType = 1684 SE->getWiderType(MaxBECount->getType(), ExactExitCount->getType()); 1685 ExactExitCount = SE->getNoopOrZeroExtend(ExactExitCount, WiderType); 1686 MaxBECount = SE->getNoopOrZeroExtend(MaxBECount, WiderType); 1687 assert(MaxBECount->getType() == ExactExitCount->getType()); 1688 1689 // Can we prove that some other exit must be taken strictly before this 1690 // one? 1691 if (SE->isLoopEntryGuardedByCond(L, CmpInst::ICMP_ULT, MaxBECount, 1692 ExactExitCount)) { 1693 foldExit(L, ExitingBB, false, DeadInsts); 1694 Changed = true; 1695 continue; 1696 } 1697 1698 // As we run, keep track of which exit counts we've encountered. If we 1699 // find a duplicate, we've found an exit which would have exited on the 1700 // exiting iteration, but (from the visit order) strictly follows another 1701 // which does the same and is thus dead. 1702 if (!DominatingExactExitCounts.insert(ExactExitCount).second) { 1703 foldExit(L, ExitingBB, false, DeadInsts); 1704 Changed = true; 1705 continue; 1706 } 1707 1708 // TODO: There might be another oppurtunity to leverage SCEV's reasoning 1709 // here. If we kept track of the min of dominanting exits so far, we could 1710 // discharge exits with EC >= MDEC. This is less powerful than the existing 1711 // transform (since later exits aren't considered), but potentially more 1712 // powerful for any case where SCEV can prove a >=u b, but neither a == b 1713 // or a >u b. Such a case is not currently known. 
1714 } 1715 return Changed; 1716 } 1717 1718 bool IndVarSimplify::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) { 1719 SmallVector<BasicBlock*, 16> ExitingBlocks; 1720 L->getExitingBlocks(ExitingBlocks); 1721 1722 // Finally, see if we can rewrite our exit conditions into a loop invariant 1723 // form. If we have a read-only loop, and we can tell that we must exit down 1724 // a path which does not need any of the values computed within the loop, we 1725 // can rewrite the loop to exit on the first iteration. Note that this 1726 // doesn't either a) tell us the loop exits on the first iteration (unless 1727 // *all* exits are predicateable) or b) tell us *which* exit might be taken. 1728 // This transformation looks a lot like a restricted form of dead loop 1729 // elimination, but restricted to read-only loops and without neccesssarily 1730 // needing to kill the loop entirely. 1731 if (!LoopPredication) 1732 return false; 1733 1734 // Note: ExactBTC is the exact backedge taken count *iff* the loop exits 1735 // through *explicit* control flow. We have to eliminate the possibility of 1736 // implicit exits (see below) before we know it's truly exact. 1737 const SCEV *ExactBTC = SE->getBackedgeTakenCount(L); 1738 if (isa<SCEVCouldNotCompute>(ExactBTC) || !Rewriter.isSafeToExpand(ExactBTC)) 1739 return false; 1740 1741 assert(SE->isLoopInvariant(ExactBTC, L) && "BTC must be loop invariant"); 1742 assert(ExactBTC->getType()->isIntegerTy() && "BTC must be integer"); 1743 1744 auto BadExit = [&](BasicBlock *ExitingBB) { 1745 // If our exiting block exits multiple loops, we can only rewrite the 1746 // innermost one. Otherwise, we're changing how many times the innermost 1747 // loop runs before it exits. 1748 if (LI->getLoopFor(ExitingBB) != L) 1749 return true; 1750 1751 // Can't rewrite non-branch yet. 1752 BranchInst *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator()); 1753 if (!BI) 1754 return true; 1755 1756 // If already constant, nothing to do. 
1757 if (isa<Constant>(BI->getCondition())) 1758 return true; 1759 1760 // If the exit block has phis, we need to be able to compute the values 1761 // within the loop which contains them. This assumes trivially lcssa phis 1762 // have already been removed; TODO: generalize 1763 BasicBlock *ExitBlock = 1764 BI->getSuccessor(L->contains(BI->getSuccessor(0)) ? 1 : 0); 1765 if (!ExitBlock->phis().empty()) 1766 return true; 1767 1768 const SCEV *ExitCount = SE->getExitCount(L, ExitingBB); 1769 if (isa<SCEVCouldNotCompute>(ExitCount) || 1770 !Rewriter.isSafeToExpand(ExitCount)) 1771 return true; 1772 1773 assert(SE->isLoopInvariant(ExitCount, L) && 1774 "Exit count must be loop invariant"); 1775 assert(ExitCount->getType()->isIntegerTy() && "Exit count must be integer"); 1776 return false; 1777 }; 1778 1779 // If we have any exits which can't be predicated themselves, than we can't 1780 // predicate any exit which isn't guaranteed to execute before it. Consider 1781 // two exits (a) and (b) which would both exit on the same iteration. If we 1782 // can predicate (b), but not (a), and (a) preceeds (b) along some path, then 1783 // we could convert a loop from exiting through (a) to one exiting through 1784 // (b). Note that this problem exists only for exits with the same exit 1785 // count, and we could be more aggressive when exit counts are known inequal. 1786 llvm::sort(ExitingBlocks, 1787 [&](BasicBlock *A, BasicBlock *B) { 1788 // std::sort sorts in ascending order, so we want the inverse of 1789 // the normal dominance relation, plus a tie breaker for blocks 1790 // unordered by dominance. 1791 if (DT->properlyDominates(A, B)) return true; 1792 if (DT->properlyDominates(B, A)) return false; 1793 return A->getName() < B->getName(); 1794 }); 1795 // Check to see if our exit blocks are a total order (i.e. a linear chain of 1796 // exits before the backedge). If they aren't, reasoning about reachability 1797 // is complicated and we choose not to for now. 
1798 for (unsigned i = 1; i < ExitingBlocks.size(); i++) 1799 if (!DT->dominates(ExitingBlocks[i-1], ExitingBlocks[i])) 1800 return false; 1801 1802 // Given our sorted total order, we know that exit[j] must be evaluated 1803 // after all exit[i] such j > i. 1804 for (unsigned i = 0, e = ExitingBlocks.size(); i < e; i++) 1805 if (BadExit(ExitingBlocks[i])) { 1806 ExitingBlocks.resize(i); 1807 break; 1808 } 1809 1810 if (ExitingBlocks.empty()) 1811 return false; 1812 1813 // We rely on not being able to reach an exiting block on a later iteration 1814 // then it's statically compute exit count. The implementaton of 1815 // getExitCount currently has this invariant, but assert it here so that 1816 // breakage is obvious if this ever changes.. 1817 assert(llvm::all_of(ExitingBlocks, [&](BasicBlock *ExitingBB) { 1818 return DT->dominates(ExitingBB, L->getLoopLatch()); 1819 })); 1820 1821 // At this point, ExitingBlocks consists of only those blocks which are 1822 // predicatable. Given that, we know we have at least one exit we can 1823 // predicate if the loop is doesn't have side effects and doesn't have any 1824 // implicit exits (because then our exact BTC isn't actually exact). 1825 // @Reviewers - As structured, this is O(I^2) for loop nests. Any 1826 // suggestions on how to improve this? I can obviously bail out for outer 1827 // loops, but that seems less than ideal. MemorySSA can find memory writes, 1828 // is that enough for *all* side effects? 1829 for (BasicBlock *BB : L->blocks()) 1830 for (auto &I : *BB) 1831 // TODO:isGuaranteedToTransfer 1832 if (I.mayHaveSideEffects()) 1833 return false; 1834 1835 bool Changed = false; 1836 // Finally, do the actual predication for all predicatable blocks. A couple 1837 // of notes here: 1838 // 1) We don't bother to constant fold dominated exits with identical exit 1839 // counts; that's simply a form of CSE/equality propagation and we leave 1840 // it for dedicated passes. 
1841 // 2) We insert the comparison at the branch. Hoisting introduces additional 1842 // legality constraints and we leave that to dedicated logic. We want to 1843 // predicate even if we can't insert a loop invariant expression as 1844 // peeling or unrolling will likely reduce the cost of the otherwise loop 1845 // varying check. 1846 Rewriter.setInsertPoint(L->getLoopPreheader()->getTerminator()); 1847 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 1848 Value *ExactBTCV = nullptr; // Lazily generated if needed. 1849 for (BasicBlock *ExitingBB : ExitingBlocks) { 1850 const SCEV *ExitCount = SE->getExitCount(L, ExitingBB); 1851 1852 auto *BI = cast<BranchInst>(ExitingBB->getTerminator()); 1853 Value *NewCond; 1854 if (ExitCount == ExactBTC) { 1855 NewCond = L->contains(BI->getSuccessor(0)) ? 1856 B.getFalse() : B.getTrue(); 1857 } else { 1858 Value *ECV = Rewriter.expandCodeFor(ExitCount); 1859 if (!ExactBTCV) 1860 ExactBTCV = Rewriter.expandCodeFor(ExactBTC); 1861 Value *RHS = ExactBTCV; 1862 if (ECV->getType() != RHS->getType()) { 1863 Type *WiderTy = SE->getWiderType(ECV->getType(), RHS->getType()); 1864 ECV = B.CreateZExt(ECV, WiderTy); 1865 RHS = B.CreateZExt(RHS, WiderTy); 1866 } 1867 auto Pred = L->contains(BI->getSuccessor(0)) ? 1868 ICmpInst::ICMP_NE : ICmpInst::ICMP_EQ; 1869 NewCond = B.CreateICmp(Pred, ECV, RHS); 1870 } 1871 Value *OldCond = BI->getCondition(); 1872 BI->setCondition(NewCond); 1873 if (OldCond->use_empty()) 1874 DeadInsts.emplace_back(OldCond); 1875 Changed = true; 1876 } 1877 1878 return Changed; 1879 } 1880 1881 //===----------------------------------------------------------------------===// 1882 // IndVarSimplify driver. Manage several subpasses of IV simplification. 1883 //===----------------------------------------------------------------------===// 1884 1885 bool IndVarSimplify::run(Loop *L) { 1886 // We need (and expect!) the incoming loop to be in LCSSA. 
1887 assert(L->isRecursivelyLCSSAForm(*DT, *LI) && 1888 "LCSSA required to run indvars!"); 1889 1890 // If LoopSimplify form is not available, stay out of trouble. Some notes: 1891 // - LSR currently only supports LoopSimplify-form loops. Indvars' 1892 // canonicalization can be a pessimization without LSR to "clean up" 1893 // afterwards. 1894 // - We depend on having a preheader; in particular, 1895 // Loop::getCanonicalInductionVariable only supports loops with preheaders, 1896 // and we're in trouble if we can't find the induction variable even when 1897 // we've manually inserted one. 1898 // - LFTR relies on having a single backedge. 1899 if (!L->isLoopSimplifyForm()) 1900 return false; 1901 1902 bool Changed = false; 1903 // If there are any floating-point recurrences, attempt to 1904 // transform them to use integer recurrences. 1905 Changed |= rewriteNonIntegerIVs(L); 1906 1907 // Create a rewriter object which we'll use to transform the code with. 1908 SCEVExpander Rewriter(*SE, DL, "indvars"); 1909 #ifndef NDEBUG 1910 Rewriter.setDebugType(DEBUG_TYPE); 1911 #endif 1912 1913 // Eliminate redundant IV users. 1914 // 1915 // Simplification works best when run before other consumers of SCEV. We 1916 // attempt to avoid evaluating SCEVs for sign/zero extend operations until 1917 // other expressions involving loop IVs have been evaluated. This helps SCEV 1918 // set no-wrap flags before normalizing sign/zero extension. 1919 Rewriter.disableCanonicalMode(); 1920 Changed |= simplifyAndExtend(L, Rewriter, LI); 1921 1922 // Check to see if we can compute the final value of any expressions 1923 // that are recurrent in the loop, and substitute the exit values from the 1924 // loop into any instructions outside of the loop that use the final values 1925 // of the current expressions. 
1926 if (ReplaceExitValue != NeverRepl) { 1927 if (int Rewrites = rewriteLoopExitValues(L, LI, TLI, SE, TTI, Rewriter, DT, 1928 ReplaceExitValue, DeadInsts)) { 1929 NumReplaced += Rewrites; 1930 Changed = true; 1931 } 1932 } 1933 1934 // Eliminate redundant IV cycles. 1935 NumElimIV += Rewriter.replaceCongruentIVs(L, DT, DeadInsts, TTI); 1936 1937 // Try to convert exit conditions to unsigned and rotate computation 1938 // out of the loop. Note: Handles invalidation internally if needed. 1939 Changed |= canonicalizeExitCondition(L); 1940 1941 // Try to eliminate loop exits based on analyzeable exit counts 1942 if (optimizeLoopExits(L, Rewriter)) { 1943 Changed = true; 1944 // Given we've changed exit counts, notify SCEV 1945 // Some nested loops may share same folded exit basic block, 1946 // thus we need to notify top most loop. 1947 SE->forgetTopmostLoop(L); 1948 } 1949 1950 // Try to form loop invariant tests for loop exits by changing how many 1951 // iterations of the loop run when that is unobservable. 1952 if (predicateLoopExits(L, Rewriter)) { 1953 Changed = true; 1954 // Given we've changed exit counts, notify SCEV 1955 SE->forgetLoop(L); 1956 } 1957 1958 // If we have a trip count expression, rewrite the loop's exit condition 1959 // using it. 1960 if (!DisableLFTR) { 1961 BasicBlock *PreHeader = L->getLoopPreheader(); 1962 1963 SmallVector<BasicBlock*, 16> ExitingBlocks; 1964 L->getExitingBlocks(ExitingBlocks); 1965 for (BasicBlock *ExitingBB : ExitingBlocks) { 1966 // Can't rewrite non-branch yet. 1967 if (!isa<BranchInst>(ExitingBB->getTerminator())) 1968 continue; 1969 1970 // If our exitting block exits multiple loops, we can only rewrite the 1971 // innermost one. Otherwise, we're changing how many times the innermost 1972 // loop runs before it exits. 
1973 if (LI->getLoopFor(ExitingBB) != L) 1974 continue; 1975 1976 if (!needsLFTR(L, ExitingBB)) 1977 continue; 1978 1979 const SCEV *ExitCount = SE->getExitCount(L, ExitingBB); 1980 if (isa<SCEVCouldNotCompute>(ExitCount)) 1981 continue; 1982 1983 // This was handled above, but as we form SCEVs, we can sometimes refine 1984 // existing ones; this allows exit counts to be folded to zero which 1985 // weren't when optimizeLoopExits saw them. Arguably, we should iterate 1986 // until stable to handle cases like this better. 1987 if (ExitCount->isZero()) 1988 continue; 1989 1990 PHINode *IndVar = FindLoopCounter(L, ExitingBB, ExitCount, SE, DT); 1991 if (!IndVar) 1992 continue; 1993 1994 // Avoid high cost expansions. Note: This heuristic is questionable in 1995 // that our definition of "high cost" is not exactly principled. 1996 if (Rewriter.isHighCostExpansion(ExitCount, L, SCEVCheapExpansionBudget, 1997 TTI, PreHeader->getTerminator())) 1998 continue; 1999 2000 if (!Rewriter.isSafeToExpand(ExitCount)) 2001 continue; 2002 2003 Changed |= linearFunctionTestReplace(L, ExitingBB, 2004 ExitCount, IndVar, 2005 Rewriter); 2006 } 2007 } 2008 // Clear the rewriter cache, because values that are in the rewriter's cache 2009 // can be deleted in the loop below, causing the AssertingVH in the cache to 2010 // trigger. 2011 Rewriter.clear(); 2012 2013 // Now that we're done iterating through lists, clean up any instructions 2014 // which are now dead. 2015 while (!DeadInsts.empty()) { 2016 Value *V = DeadInsts.pop_back_val(); 2017 2018 if (PHINode *PHI = dyn_cast_or_null<PHINode>(V)) 2019 Changed |= RecursivelyDeleteDeadPHINode(PHI, TLI, MSSAU.get()); 2020 else if (Instruction *Inst = dyn_cast_or_null<Instruction>(V)) 2021 Changed |= 2022 RecursivelyDeleteTriviallyDeadInstructions(Inst, TLI, MSSAU.get()); 2023 } 2024 2025 // The Rewriter may not be used from this point on. 
2026 2027 // Loop-invariant instructions in the preheader that aren't used in the 2028 // loop may be sunk below the loop to reduce register pressure. 2029 Changed |= sinkUnusedInvariants(L); 2030 2031 // rewriteFirstIterationLoopExitValues does not rely on the computation of 2032 // trip count and therefore can further simplify exit values in addition to 2033 // rewriteLoopExitValues. 2034 Changed |= rewriteFirstIterationLoopExitValues(L); 2035 2036 // Clean up dead instructions. 2037 Changed |= DeleteDeadPHIs(L->getHeader(), TLI, MSSAU.get()); 2038 2039 // Check a post-condition. 2040 assert(L->isRecursivelyLCSSAForm(*DT, *LI) && 2041 "Indvars did not preserve LCSSA!"); 2042 if (VerifyMemorySSA && MSSAU) 2043 MSSAU->getMemorySSA()->verifyMemorySSA(); 2044 2045 return Changed; 2046 } 2047 2048 PreservedAnalyses IndVarSimplifyPass::run(Loop &L, LoopAnalysisManager &AM, 2049 LoopStandardAnalysisResults &AR, 2050 LPMUpdater &) { 2051 Function *F = L.getHeader()->getParent(); 2052 const DataLayout &DL = F->getParent()->getDataLayout(); 2053 2054 IndVarSimplify IVS(&AR.LI, &AR.SE, &AR.DT, DL, &AR.TLI, &AR.TTI, AR.MSSA, 2055 WidenIndVars && AllowIVWidening); 2056 if (!IVS.run(&L)) 2057 return PreservedAnalyses::all(); 2058 2059 auto PA = getLoopPassPreservedAnalyses(); 2060 PA.preserveSet<CFGAnalyses>(); 2061 if (AR.MSSA) 2062 PA.preserve<MemorySSAAnalysis>(); 2063 return PA; 2064 } 2065