1 ///===- SimpleLoopUnswitch.cpp - Hoist loop-invariant control flow ---------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h" 10 #include "llvm/ADT/DenseMap.h" 11 #include "llvm/ADT/STLExtras.h" 12 #include "llvm/ADT/Sequence.h" 13 #include "llvm/ADT/SetVector.h" 14 #include "llvm/ADT/SmallPtrSet.h" 15 #include "llvm/ADT/SmallVector.h" 16 #include "llvm/ADT/Statistic.h" 17 #include "llvm/ADT/Twine.h" 18 #include "llvm/Analysis/AssumptionCache.h" 19 #include "llvm/Analysis/BlockFrequencyInfo.h" 20 #include "llvm/Analysis/CFG.h" 21 #include "llvm/Analysis/CodeMetrics.h" 22 #include "llvm/Analysis/DomTreeUpdater.h" 23 #include "llvm/Analysis/GuardUtils.h" 24 #include "llvm/Analysis/LoopAnalysisManager.h" 25 #include "llvm/Analysis/LoopInfo.h" 26 #include "llvm/Analysis/LoopIterator.h" 27 #include "llvm/Analysis/MemorySSA.h" 28 #include "llvm/Analysis/MemorySSAUpdater.h" 29 #include "llvm/Analysis/MustExecute.h" 30 #include "llvm/Analysis/ProfileSummaryInfo.h" 31 #include "llvm/Analysis/ScalarEvolution.h" 32 #include "llvm/Analysis/TargetTransformInfo.h" 33 #include "llvm/Analysis/ValueTracking.h" 34 #include "llvm/IR/BasicBlock.h" 35 #include "llvm/IR/Constant.h" 36 #include "llvm/IR/Constants.h" 37 #include "llvm/IR/Dominators.h" 38 #include "llvm/IR/Function.h" 39 #include "llvm/IR/IRBuilder.h" 40 #include "llvm/IR/InstrTypes.h" 41 #include "llvm/IR/Instruction.h" 42 #include "llvm/IR/Instructions.h" 43 #include "llvm/IR/IntrinsicInst.h" 44 #include "llvm/IR/Module.h" 45 #include "llvm/IR/PatternMatch.h" 46 #include "llvm/IR/ProfDataUtils.h" 47 #include "llvm/IR/Use.h" 48 #include "llvm/IR/Value.h" 49 #include "llvm/Support/Casting.h" 50 #include "llvm/Support/CommandLine.h" 51 #include "llvm/Support/Debug.h" 52 #include "llvm/Support/ErrorHandling.h" 53 #include "llvm/Support/GenericDomTree.h" 54 #include "llvm/Support/InstructionCost.h" 55 #include "llvm/Support/raw_ostream.h" 56 #include "llvm/Transforms/Scalar/LoopPassManager.h" 57 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 58 #include "llvm/Transforms/Utils/Cloning.h" 59 #include "llvm/Transforms/Utils/Local.h" 60 #include "llvm/Transforms/Utils/LoopUtils.h" 61 #include "llvm/Transforms/Utils/ValueMapper.h" 62 #include <algorithm> 63 #include <cassert> 64 #include <iterator> 65 #include <numeric> 66 #include <optional> 67 #include <utility> 68 69 #define DEBUG_TYPE "simple-loop-unswitch" 70 71 using namespace llvm; 72 using namespace llvm::PatternMatch; 73 74 STATISTIC(NumBranches, "Number of branches unswitched"); 75 STATISTIC(NumSwitches, "Number of switches unswitched"); 76 STATISTIC(NumSelects, "Number of selects turned into branches for unswitching"); 77 STATISTIC(NumGuards, "Number of guards turned into branches for unswitching"); 78 STATISTIC(NumTrivial, "Number of unswitches that are trivial"); 79 STATISTIC( 80 NumCostMultiplierSkipped, 81 "Number of unswitch candidates that had their cost multiplier skipped"); 82 STATISTIC(NumInvariantConditionsInjected, 83 "Number of invariant conditions injected and unswitched"); 84 85 static cl::opt<bool> EnableNonTrivialUnswitch( 86 "enable-nontrivial-unswitch", cl::init(false), cl::Hidden, 87 cl::desc("Forcibly enables non-trivial loop unswitching rather than " 88 "following the configuration passed into the pass.")); 89 90 static cl::opt<int> 91 UnswitchThreshold("unswitch-threshold", cl::init(50), cl::Hidden, 92 cl::desc("The cost threshold for unswitching a loop.")); 93 94 static cl::opt<bool> EnableUnswitchCostMultiplier( 95 "enable-unswitch-cost-multiplier", cl::init(true), cl::Hidden, 96 cl::desc("Enable unswitch cost multiplier that prohibits exponential " 97 "explosion in nontrivial unswitch.")); 98 static cl::opt<int> UnswitchSiblingsToplevelDiv( 99 "unswitch-siblings-toplevel-div", cl::init(2), cl::Hidden, 100 cl::desc("Toplevel siblings divisor for cost multiplier.")); 101 static cl::opt<int> UnswitchNumInitialUnscaledCandidates( 102 "unswitch-num-initial-unscaled-candidates", cl::init(8), cl::Hidden, 103 cl::desc("Number of unswitch candidates that are ignored when calculating " 104 "cost multiplier.")); 105 static cl::opt<bool> UnswitchGuards( 106 "simple-loop-unswitch-guards", cl::init(true), cl::Hidden, 107 cl::desc("If enabled, simple loop unswitching will also consider " 108 "llvm.experimental.guard intrinsics as unswitch candidates.")); 109 static cl::opt<bool> DropNonTrivialImplicitNullChecks( 110 "simple-loop-unswitch-drop-non-trivial-implicit-null-checks", 111 cl::init(false), cl::Hidden, 112 cl::desc("If enabled, drop make.implicit metadata in unswitched implicit " 113 "null checks to save time analyzing if we can keep it.")); 114 static cl::opt<unsigned> 115 MSSAThreshold("simple-loop-unswitch-memoryssa-threshold", 116 cl::desc("Max number of memory uses to explore during " 117 "partial unswitching analysis"), 118 cl::init(100), cl::Hidden); 119 static cl::opt<bool> FreezeLoopUnswitchCond( 120 "freeze-loop-unswitch-cond", cl::init(true), cl::Hidden, 121 cl::desc("If enabled, the freeze instruction will be added to condition " 122 "of loop unswitch to prevent miscompilation.")); 123 124 static cl::opt<bool> InjectInvariantConditions( 125 "simple-loop-unswitch-inject-invariant-conditions", cl::Hidden, 126 cl::desc("Whether we should inject new invariants and unswitch them to " 127 "eliminate some existing (non-invariant) conditions."), 128 cl::init(true)); 129 130 static cl::opt<unsigned> InjectInvariantConditionHotnesThreshold( 131 "simple-loop-unswitch-inject-invariant-condition-hotness-threshold", 132 cl::Hidden, cl::desc("Only try to inject loop invariant conditions and " 133 "unswitch on them to eliminate branches that are " 134 "not-taken 1/<this option> times or less."), 135 cl::init(16)); 136 137 AnalysisKey ShouldRunExtraSimpleLoopUnswitch::Key; 138 namespace { 139 struct CompareDesc { 140 BranchInst *Term; 141 Value *Invariant; 142 BasicBlock *InLoopSucc; 143 144 CompareDesc(BranchInst *Term, Value *Invariant, BasicBlock *InLoopSucc) 145 : Term(Term), Invariant(Invariant), InLoopSucc(InLoopSucc) {} 146 }; 147 148 struct InjectedInvariant { 149 ICmpInst::Predicate Pred; 150 Value *LHS; 151 Value *RHS; 152 BasicBlock *InLoopSucc; 153 154 InjectedInvariant(ICmpInst::Predicate Pred, Value *LHS, Value *RHS, 155 BasicBlock *InLoopSucc) 156 : Pred(Pred), LHS(LHS), RHS(RHS), InLoopSucc(InLoopSucc) {} 157 }; 158 159 struct NonTrivialUnswitchCandidate { 160 Instruction *TI = nullptr; 161 TinyPtrVector<Value *> Invariants; 162 std::optional<InstructionCost> Cost; 163 std::optional<InjectedInvariant> PendingInjection; 164 NonTrivialUnswitchCandidate( 165 Instruction *TI, ArrayRef<Value *> Invariants, 166 std::optional<InstructionCost> Cost = std::nullopt, 167 std::optional<InjectedInvariant> PendingInjection = std::nullopt) 168 : TI(TI), Invariants(Invariants), Cost(Cost), 169 PendingInjection(PendingInjection) {}; 170 171 bool hasPendingInjection() const { return PendingInjection.has_value(); } 172 }; 173 } // end anonymous namespace. 174 175 // Helper to skip (select x, true, false), which matches both a logical AND and 176 // OR and can confuse code that tries to determine if \p Cond is either a 177 // logical AND or OR but not both. 178 static Value *skipTrivialSelect(Value *Cond) { 179 Value *CondNext; 180 while (match(Cond, m_Select(m_Value(CondNext), m_One(), m_Zero()))) 181 Cond = CondNext; 182 return Cond; 183 } 184 185 /// Collect all of the loop invariant input values transitively used by the 186 /// homogeneous instruction graph from a given root. 187 /// 188 /// This essentially walks from a root recursively through loop variant operands 189 /// which have perform the same logical operation (AND or OR) and finds all 190 /// inputs which are loop invariant. For some operations these can be 191 /// re-associated and unswitched out of the loop entirely. 192 static TinyPtrVector<Value *> 193 collectHomogenousInstGraphLoopInvariants(const Loop &L, Instruction &Root, 194 const LoopInfo &LI) { 195 assert(!L.isLoopInvariant(&Root) && 196 "Only need to walk the graph if root itself is not invariant."); 197 TinyPtrVector<Value *> Invariants; 198 199 bool IsRootAnd = match(&Root, m_LogicalAnd()); 200 bool IsRootOr = match(&Root, m_LogicalOr()); 201 202 // Build a worklist and recurse through operators collecting invariants. 203 SmallVector<Instruction *, 4> Worklist; 204 SmallPtrSet<Instruction *, 8> Visited; 205 Worklist.push_back(&Root); 206 Visited.insert(&Root); 207 do { 208 Instruction &I = *Worklist.pop_back_val(); 209 for (Value *OpV : I.operand_values()) { 210 // Skip constants as unswitching isn't interesting for them. 211 if (isa<Constant>(OpV)) 212 continue; 213 214 // Add it to our result if loop invariant. 215 if (L.isLoopInvariant(OpV)) { 216 Invariants.push_back(OpV); 217 continue; 218 } 219 220 // If not an instruction with the same opcode, nothing we can do. 221 Instruction *OpI = dyn_cast<Instruction>(skipTrivialSelect(OpV)); 222 223 if (OpI && ((IsRootAnd && match(OpI, m_LogicalAnd())) || 224 (IsRootOr && match(OpI, m_LogicalOr())))) { 225 // Visit this operand. 226 if (Visited.insert(OpI).second) 227 Worklist.push_back(OpI); 228 } 229 } 230 } while (!Worklist.empty()); 231 232 return Invariants; 233 } 234 235 static void replaceLoopInvariantUses(const Loop &L, Value *Invariant, 236 Constant &Replacement) { 237 assert(!isa<Constant>(Invariant) && "Why are we unswitching on a constant?"); 238 239 // Replace uses of LIC in the loop with the given constant. 240 // We use make_early_inc_range as set invalidates the iterator. 241 for (Use &U : llvm::make_early_inc_range(Invariant->uses())) { 242 Instruction *UserI = dyn_cast<Instruction>(U.getUser()); 243 244 // Replace this use within the loop body. 245 if (UserI && L.contains(UserI)) 246 U.set(&Replacement); 247 } 248 } 249 250 /// Check that all the LCSSA PHI nodes in the loop exit block have trivial 251 /// incoming values along this edge. 252 static bool areLoopExitPHIsLoopInvariant(const Loop &L, 253 const BasicBlock &ExitingBB, 254 const BasicBlock &ExitBB) { 255 for (const Instruction &I : ExitBB) { 256 auto *PN = dyn_cast<PHINode>(&I); 257 if (!PN) 258 // No more PHIs to check. 259 return true; 260 261 // If the incoming value for this edge isn't loop invariant the unswitch 262 // won't be trivial. 263 if (!L.isLoopInvariant(PN->getIncomingValueForBlock(&ExitingBB))) 264 return false; 265 } 266 llvm_unreachable("Basic blocks should never be empty!"); 267 } 268 269 /// Copy a set of loop invariant values \p ToDuplicate and insert them at the 270 /// end of \p BB and conditionally branch on the copied condition. We only 271 /// branch on a single value. 272 static void buildPartialUnswitchConditionalBranch( 273 BasicBlock &BB, ArrayRef<Value *> Invariants, bool Direction, 274 BasicBlock &UnswitchedSucc, BasicBlock &NormalSucc, bool InsertFreeze, 275 const Instruction *I, AssumptionCache *AC, const DominatorTree &DT) { 276 IRBuilder<> IRB(&BB); 277 IRB.SetCurrentDebugLocation(DebugLoc::getCompilerGenerated()); 278 279 SmallVector<Value *> FrozenInvariants; 280 for (Value *Inv : Invariants) { 281 if (InsertFreeze && !isGuaranteedNotToBeUndefOrPoison(Inv, AC, I, &DT)) 282 Inv = IRB.CreateFreeze(Inv, Inv->getName() + ".fr"); 283 FrozenInvariants.push_back(Inv); 284 } 285 286 Value *Cond = Direction ? IRB.CreateOr(FrozenInvariants) 287 : IRB.CreateAnd(FrozenInvariants); 288 IRB.CreateCondBr(Cond, Direction ? &UnswitchedSucc : &NormalSucc, 289 Direction ? &NormalSucc : &UnswitchedSucc); 290 } 291 292 /// Copy a set of loop invariant values, and conditionally branch on them. 293 static void buildPartialInvariantUnswitchConditionalBranch( 294 BasicBlock &BB, ArrayRef<Value *> ToDuplicate, bool Direction, 295 BasicBlock &UnswitchedSucc, BasicBlock &NormalSucc, Loop &L, 296 MemorySSAUpdater *MSSAU) { 297 ValueToValueMapTy VMap; 298 for (auto *Val : reverse(ToDuplicate)) { 299 Instruction *Inst = cast<Instruction>(Val); 300 Instruction *NewInst = Inst->clone(); 301 302 if (const DebugLoc &DL = Inst->getDebugLoc()) 303 mapAtomInstance(DL, VMap); 304 305 NewInst->insertInto(&BB, BB.end()); 306 RemapInstruction(NewInst, VMap, 307 RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); 308 VMap[Val] = NewInst; 309 310 if (!MSSAU) 311 continue; 312 313 MemorySSA *MSSA = MSSAU->getMemorySSA(); 314 if (auto *MemUse = 315 dyn_cast_or_null<MemoryUse>(MSSA->getMemoryAccess(Inst))) { 316 auto *DefiningAccess = MemUse->getDefiningAccess(); 317 // Get the first defining access before the loop. 318 while (L.contains(DefiningAccess->getBlock())) { 319 // If the defining access is a MemoryPhi, get the incoming 320 // value for the pre-header as defining access. 321 if (auto *MemPhi = dyn_cast<MemoryPhi>(DefiningAccess)) 322 DefiningAccess = 323 MemPhi->getIncomingValueForBlock(L.getLoopPreheader()); 324 else 325 DefiningAccess = cast<MemoryDef>(DefiningAccess)->getDefiningAccess(); 326 } 327 MSSAU->createMemoryAccessInBB(NewInst, DefiningAccess, 328 NewInst->getParent(), 329 MemorySSA::BeforeTerminator); 330 } 331 } 332 333 IRBuilder<> IRB(&BB); 334 IRB.SetCurrentDebugLocation(DebugLoc::getCompilerGenerated()); 335 Value *Cond = VMap[ToDuplicate[0]]; 336 IRB.CreateCondBr(Cond, Direction ? &UnswitchedSucc : &NormalSucc, 337 Direction ? &NormalSucc : &UnswitchedSucc); 338 } 339 340 /// Rewrite the PHI nodes in an unswitched loop exit basic block. 341 /// 342 /// Requires that the loop exit and unswitched basic block are the same, and 343 /// that the exiting block was a unique predecessor of that block. Rewrites the 344 /// PHI nodes in that block such that what were LCSSA PHI nodes become trivial 345 /// PHI nodes from the old preheader that now contains the unswitched 346 /// terminator. 347 static void rewritePHINodesForUnswitchedExitBlock(BasicBlock &UnswitchedBB, 348 BasicBlock &OldExitingBB, 349 BasicBlock &OldPH) { 350 for (PHINode &PN : UnswitchedBB.phis()) { 351 // When the loop exit is directly unswitched we just need to update the 352 // incoming basic block. We loop to handle weird cases with repeated 353 // incoming blocks, but expect to typically only have one operand here. 354 for (auto i : seq<int>(0, PN.getNumOperands())) { 355 assert(PN.getIncomingBlock(i) == &OldExitingBB && 356 "Found incoming block different from unique predecessor!"); 357 PN.setIncomingBlock(i, &OldPH); 358 } 359 } 360 } 361 362 /// Rewrite the PHI nodes in the loop exit basic block and the split off 363 /// unswitched block. 364 /// 365 /// Because the exit block remains an exit from the loop, this rewrites the 366 /// LCSSA PHI nodes in it to remove the unswitched edge and introduces PHI 367 /// nodes into the unswitched basic block to select between the value in the 368 /// old preheader and the loop exit. 369 static void rewritePHINodesForExitAndUnswitchedBlocks(BasicBlock &ExitBB, 370 BasicBlock &UnswitchedBB, 371 BasicBlock &OldExitingBB, 372 BasicBlock &OldPH, 373 bool FullUnswitch) { 374 assert(&ExitBB != &UnswitchedBB && 375 "Must have different loop exit and unswitched blocks!"); 376 BasicBlock::iterator InsertPt = UnswitchedBB.begin(); 377 for (PHINode &PN : ExitBB.phis()) { 378 auto *NewPN = PHINode::Create(PN.getType(), /*NumReservedValues*/ 2, 379 PN.getName() + ".split"); 380 NewPN->insertBefore(InsertPt); 381 382 // Walk backwards over the old PHI node's inputs to minimize the cost of 383 // removing each one. We have to do this weird loop manually so that we 384 // create the same number of new incoming edges in the new PHI as we expect 385 // each case-based edge to be included in the unswitched switch in some 386 // cases. 387 // FIXME: This is really, really gross. It would be much cleaner if LLVM 388 // allowed us to create a single entry for a predecessor block without 389 // having separate entries for each "edge" even though these edges are 390 // required to produce identical results. 391 for (int i = PN.getNumIncomingValues() - 1; i >= 0; --i) { 392 if (PN.getIncomingBlock(i) != &OldExitingBB) 393 continue; 394 395 Value *Incoming = PN.getIncomingValue(i); 396 if (FullUnswitch) 397 // No more edge from the old exiting block to the exit block. 398 PN.removeIncomingValue(i); 399 400 NewPN->addIncoming(Incoming, &OldPH); 401 } 402 403 // Now replace the old PHI with the new one and wire the old one in as an 404 // input to the new one. 405 PN.replaceAllUsesWith(NewPN); 406 NewPN->addIncoming(&PN, &ExitBB); 407 } 408 } 409 410 /// Hoist the current loop up to the innermost loop containing a remaining exit. 411 /// 412 /// Because we've removed an exit from the loop, we may have changed the set of 413 /// loops reachable and need to move the current loop up the loop nest or even 414 /// to an entirely separate nest. 415 static void hoistLoopToNewParent(Loop &L, BasicBlock &Preheader, 416 DominatorTree &DT, LoopInfo &LI, 417 MemorySSAUpdater *MSSAU, ScalarEvolution *SE) { 418 // If the loop is already at the top level, we can't hoist it anywhere. 419 Loop *OldParentL = L.getParentLoop(); 420 if (!OldParentL) 421 return; 422 423 SmallVector<BasicBlock *, 4> Exits; 424 L.getExitBlocks(Exits); 425 Loop *NewParentL = nullptr; 426 for (auto *ExitBB : Exits) 427 if (Loop *ExitL = LI.getLoopFor(ExitBB)) 428 if (!NewParentL || NewParentL->contains(ExitL)) 429 NewParentL = ExitL; 430 431 if (NewParentL == OldParentL) 432 return; 433 434 // The new parent loop (if different) should always contain the old one. 435 if (NewParentL) 436 assert(NewParentL->contains(OldParentL) && 437 "Can only hoist this loop up the nest!"); 438 439 // The preheader will need to move with the body of this loop. However, 440 // because it isn't in this loop we also need to update the primary loop map. 441 assert(OldParentL == LI.getLoopFor(&Preheader) && 442 "Parent loop of this loop should contain this loop's preheader!"); 443 LI.changeLoopFor(&Preheader, NewParentL); 444 445 // Remove this loop from its old parent. 446 OldParentL->removeChildLoop(&L); 447 448 // Add the loop either to the new parent or as a top-level loop. 449 if (NewParentL) 450 NewParentL->addChildLoop(&L); 451 else 452 LI.addTopLevelLoop(&L); 453 454 // Remove this loops blocks from the old parent and every other loop up the 455 // nest until reaching the new parent. Also update all of these 456 // no-longer-containing loops to reflect the nesting change. 457 for (Loop *OldContainingL = OldParentL; OldContainingL != NewParentL; 458 OldContainingL = OldContainingL->getParentLoop()) { 459 llvm::erase_if(OldContainingL->getBlocksVector(), 460 [&](const BasicBlock *BB) { 461 return BB == &Preheader || L.contains(BB); 462 }); 463 464 OldContainingL->getBlocksSet().erase(&Preheader); 465 for (BasicBlock *BB : L.blocks()) 466 OldContainingL->getBlocksSet().erase(BB); 467 468 // Because we just hoisted a loop out of this one, we have essentially 469 // created new exit paths from it. That means we need to form LCSSA PHI 470 // nodes for values used in the no-longer-nested loop. 471 formLCSSA(*OldContainingL, DT, &LI, SE); 472 473 // We shouldn't need to form dedicated exits because the exit introduced 474 // here is the (just split by unswitching) preheader. However, after trivial 475 // unswitching it is possible to get new non-dedicated exits out of parent 476 // loop so let's conservatively form dedicated exit blocks and figure out 477 // if we can optimize later. 478 formDedicatedExitBlocks(OldContainingL, &DT, &LI, MSSAU, 479 /*PreserveLCSSA*/ true); 480 } 481 } 482 483 // Return the top-most loop containing ExitBB and having ExitBB as exiting block 484 // or the loop containing ExitBB, if there is no parent loop containing ExitBB 485 // as exiting block. 486 static Loop *getTopMostExitingLoop(const BasicBlock *ExitBB, 487 const LoopInfo &LI) { 488 Loop *TopMost = LI.getLoopFor(ExitBB); 489 Loop *Current = TopMost; 490 while (Current) { 491 if (Current->isLoopExiting(ExitBB)) 492 TopMost = Current; 493 Current = Current->getParentLoop(); 494 } 495 return TopMost; 496 } 497 498 /// Unswitch a trivial branch if the condition is loop invariant. 499 /// 500 /// This routine should only be called when loop code leading to the branch has 501 /// been validated as trivial (no side effects). This routine checks if the 502 /// condition is invariant and one of the successors is a loop exit. This 503 /// allows us to unswitch without duplicating the loop, making it trivial. 504 /// 505 /// If this routine fails to unswitch the branch it returns false. 506 /// 507 /// If the branch can be unswitched, this routine splits the preheader and 508 /// hoists the branch above that split. Preserves loop simplified form 509 /// (splitting the exit block as necessary). It simplifies the branch within 510 /// the loop to an unconditional branch but doesn't remove it entirely. Further 511 /// cleanup can be done with some simplifycfg like pass. 512 /// 513 /// If `SE` is not null, it will be updated based on the potential loop SCEVs 514 /// invalidated by this. 515 static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT, 516 LoopInfo &LI, ScalarEvolution *SE, 517 MemorySSAUpdater *MSSAU) { 518 assert(BI.isConditional() && "Can only unswitch a conditional branch!"); 519 LLVM_DEBUG(dbgs() << " Trying to unswitch branch: " << BI << "\n"); 520 521 // The loop invariant values that we want to unswitch. 522 TinyPtrVector<Value *> Invariants; 523 524 // When true, we're fully unswitching the branch rather than just unswitching 525 // some input conditions to the branch. 526 bool FullUnswitch = false; 527 528 Value *Cond = skipTrivialSelect(BI.getCondition()); 529 if (L.isLoopInvariant(Cond)) { 530 Invariants.push_back(Cond); 531 FullUnswitch = true; 532 } else { 533 if (auto *CondInst = dyn_cast<Instruction>(Cond)) 534 Invariants = collectHomogenousInstGraphLoopInvariants(L, *CondInst, LI); 535 if (Invariants.empty()) { 536 LLVM_DEBUG(dbgs() << " Couldn't find invariant inputs!\n"); 537 return false; 538 } 539 } 540 541 // Check that one of the branch's successors exits, and which one. 542 bool ExitDirection = true; 543 int LoopExitSuccIdx = 0; 544 auto *LoopExitBB = BI.getSuccessor(0); 545 if (L.contains(LoopExitBB)) { 546 ExitDirection = false; 547 LoopExitSuccIdx = 1; 548 LoopExitBB = BI.getSuccessor(1); 549 if (L.contains(LoopExitBB)) { 550 LLVM_DEBUG(dbgs() << " Branch doesn't exit the loop!\n"); 551 return false; 552 } 553 } 554 auto *ContinueBB = BI.getSuccessor(1 - LoopExitSuccIdx); 555 auto *ParentBB = BI.getParent(); 556 if (!areLoopExitPHIsLoopInvariant(L, *ParentBB, *LoopExitBB)) { 557 LLVM_DEBUG(dbgs() << " Loop exit PHI's aren't loop-invariant!\n"); 558 return false; 559 } 560 561 // When unswitching only part of the branch's condition, we need the exit 562 // block to be reached directly from the partially unswitched input. This can 563 // be done when the exit block is along the true edge and the branch condition 564 // is a graph of `or` operations, or the exit block is along the false edge 565 // and the condition is a graph of `and` operations. 566 if (!FullUnswitch) { 567 if (ExitDirection ? !match(Cond, m_LogicalOr()) 568 : !match(Cond, m_LogicalAnd())) { 569 LLVM_DEBUG(dbgs() << " Branch condition is in improper form for " 570 "non-full unswitch!\n"); 571 return false; 572 } 573 } 574 575 LLVM_DEBUG({ 576 dbgs() << " unswitching trivial invariant conditions for: " << BI 577 << "\n"; 578 for (Value *Invariant : Invariants) { 579 dbgs() << " " << *Invariant << " == true"; 580 if (Invariant != Invariants.back()) 581 dbgs() << " ||"; 582 dbgs() << "\n"; 583 } 584 }); 585 586 // If we have scalar evolutions, we need to invalidate them including this 587 // loop, the loop containing the exit block and the topmost parent loop 588 // exiting via LoopExitBB. 589 if (SE) { 590 if (const Loop *ExitL = getTopMostExitingLoop(LoopExitBB, LI)) 591 SE->forgetLoop(ExitL); 592 else 593 // Forget the entire nest as this exits the entire nest. 594 SE->forgetTopmostLoop(&L); 595 SE->forgetBlockAndLoopDispositions(); 596 } 597 598 if (MSSAU && VerifyMemorySSA) 599 MSSAU->getMemorySSA()->verifyMemorySSA(); 600 601 // Split the preheader, so that we know that there is a safe place to insert 602 // the conditional branch. We will change the preheader to have a conditional 603 // branch on LoopCond. 604 BasicBlock *OldPH = L.getLoopPreheader(); 605 BasicBlock *NewPH = SplitEdge(OldPH, L.getHeader(), &DT, &LI, MSSAU); 606 607 // Now that we have a place to insert the conditional branch, create a place 608 // to branch to: this is the exit block out of the loop that we are 609 // unswitching. We need to split this if there are other loop predecessors. 610 // Because the loop is in simplified form, *any* other predecessor is enough. 611 BasicBlock *UnswitchedBB; 612 if (FullUnswitch && LoopExitBB->getUniquePredecessor()) { 613 assert(LoopExitBB->getUniquePredecessor() == BI.getParent() && 614 "A branch's parent isn't a predecessor!"); 615 UnswitchedBB = LoopExitBB; 616 } else { 617 UnswitchedBB = 618 SplitBlock(LoopExitBB, LoopExitBB->begin(), &DT, &LI, MSSAU, "", false); 619 } 620 621 if (MSSAU && VerifyMemorySSA) 622 MSSAU->getMemorySSA()->verifyMemorySSA(); 623 624 // Actually move the invariant uses into the unswitched position. If possible, 625 // we do this by moving the instructions, but when doing partial unswitching 626 // we do it by building a new merge of the values in the unswitched position. 627 OldPH->getTerminator()->eraseFromParent(); 628 if (FullUnswitch) { 629 // If fully unswitching, we can use the existing branch instruction. 630 // Splice it into the old PH to gate reaching the new preheader and re-point 631 // its successors. 632 BI.moveBefore(*OldPH, OldPH->end()); 633 BI.setCondition(Cond); 634 if (MSSAU) { 635 // Temporarily clone the terminator, to make MSSA update cheaper by 636 // separating "insert edge" updates from "remove edge" ones. 637 BI.clone()->insertInto(ParentBB, ParentBB->end()); 638 } else { 639 // Create a new unconditional branch that will continue the loop as a new 640 // terminator. 641 Instruction *NewBI = BranchInst::Create(ContinueBB, ParentBB); 642 NewBI->setDebugLoc(BI.getDebugLoc()); 643 } 644 BI.setSuccessor(LoopExitSuccIdx, UnswitchedBB); 645 BI.setSuccessor(1 - LoopExitSuccIdx, NewPH); 646 } else { 647 // Only unswitching a subset of inputs to the condition, so we will need to 648 // build a new branch that merges the invariant inputs. 649 if (ExitDirection) 650 assert(match(skipTrivialSelect(BI.getCondition()), m_LogicalOr()) && 651 "Must have an `or` of `i1`s or `select i1 X, true, Y`s for the " 652 "condition!"); 653 else 654 assert(match(skipTrivialSelect(BI.getCondition()), m_LogicalAnd()) && 655 "Must have an `and` of `i1`s or `select i1 X, Y, false`s for the" 656 " condition!"); 657 buildPartialUnswitchConditionalBranch( 658 *OldPH, Invariants, ExitDirection, *UnswitchedBB, *NewPH, 659 FreezeLoopUnswitchCond, OldPH->getTerminator(), nullptr, DT); 660 } 661 662 // Update the dominator tree with the added edge. 663 DT.insertEdge(OldPH, UnswitchedBB); 664 665 // After the dominator tree was updated with the added edge, update MemorySSA 666 // if available. 667 if (MSSAU) { 668 SmallVector<CFGUpdate, 1> Updates; 669 Updates.push_back({cfg::UpdateKind::Insert, OldPH, UnswitchedBB}); 670 MSSAU->applyInsertUpdates(Updates, DT); 671 } 672 673 // Finish updating dominator tree and memory ssa for full unswitch. 674 if (FullUnswitch) { 675 if (MSSAU) { 676 Instruction *Term = ParentBB->getTerminator(); 677 // Remove the cloned branch instruction and create unconditional branch 678 // now. 679 Instruction *NewBI = BranchInst::Create(ContinueBB, ParentBB); 680 NewBI->setDebugLoc(Term->getDebugLoc()); 681 Term->eraseFromParent(); 682 MSSAU->removeEdge(ParentBB, LoopExitBB); 683 } 684 DT.deleteEdge(ParentBB, LoopExitBB); 685 } 686 687 if (MSSAU && VerifyMemorySSA) 688 MSSAU->getMemorySSA()->verifyMemorySSA(); 689 690 // Rewrite the relevant PHI nodes. 691 if (UnswitchedBB == LoopExitBB) 692 rewritePHINodesForUnswitchedExitBlock(*UnswitchedBB, *ParentBB, *OldPH); 693 else 694 rewritePHINodesForExitAndUnswitchedBlocks(*LoopExitBB, *UnswitchedBB, 695 *ParentBB, *OldPH, FullUnswitch); 696 697 // The constant we can replace all of our invariants with inside the loop 698 // body. If any of the invariants have a value other than this the loop won't 699 // be entered. 700 ConstantInt *Replacement = ExitDirection 701 ? ConstantInt::getFalse(BI.getContext()) 702 : ConstantInt::getTrue(BI.getContext()); 703 704 // Since this is an i1 condition we can also trivially replace uses of it 705 // within the loop with a constant. 706 for (Value *Invariant : Invariants) 707 replaceLoopInvariantUses(L, Invariant, *Replacement); 708 709 // If this was full unswitching, we may have changed the nesting relationship 710 // for this loop so hoist it to its correct parent if needed. 711 if (FullUnswitch) 712 hoistLoopToNewParent(L, *NewPH, DT, LI, MSSAU, SE); 713 714 if (MSSAU && VerifyMemorySSA) 715 MSSAU->getMemorySSA()->verifyMemorySSA(); 716 717 LLVM_DEBUG(dbgs() << " done: unswitching trivial branch...\n"); 718 ++NumTrivial; 719 ++NumBranches; 720 return true; 721 } 722 723 /// Unswitch a trivial switch if the condition is loop invariant. 724 /// 725 /// This routine should only be called when loop code leading to the switch has 726 /// been validated as trivial (no side effects). This routine checks if the 727 /// condition is invariant and that at least one of the successors is a loop 728 /// exit. This allows us to unswitch without duplicating the loop, making it 729 /// trivial. 730 /// 731 /// If this routine fails to unswitch the switch it returns false. 732 /// 733 /// If the switch can be unswitched, this routine splits the preheader and 734 /// copies the switch above that split. If the default case is one of the 735 /// exiting cases, it copies the non-exiting cases and points them at the new 736 /// preheader. If the default case is not exiting, it copies the exiting cases 737 /// and points the default at the preheader. It preserves loop simplified form 738 /// (splitting the exit blocks as necessary). It simplifies the switch within 739 /// the loop by removing now-dead cases. If the default case is one of those 740 /// unswitched, it replaces its destination with a new basic block containing 741 /// only unreachable. Such basic blocks, while technically loop exits, are not 742 /// considered for unswitching so this is a stable transform and the same 743 /// switch will not be revisited. If after unswitching there is only a single 744 /// in-loop successor, the switch is further simplified to an unconditional 745 /// branch. Still more cleanup can be done with some simplifycfg like pass. 746 /// 747 /// If `SE` is not null, it will be updated based on the potential loop SCEVs 748 /// invalidated by this. 749 static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT, 750 LoopInfo &LI, ScalarEvolution *SE, 751 MemorySSAUpdater *MSSAU) { 752 LLVM_DEBUG(dbgs() << " Trying to unswitch switch: " << SI << "\n"); 753 Value *LoopCond = SI.getCondition(); 754 755 // If this isn't switching on an invariant condition, we can't unswitch it. 756 if (!L.isLoopInvariant(LoopCond)) 757 return false; 758 759 auto *ParentBB = SI.getParent(); 760 761 // The same check must be used both for the default and the exit cases. We 762 // should never leave edges from the switch instruction to a basic block that 763 // we are unswitching, hence the condition used to determine the default case 764 // needs to also be used to populate ExitCaseIndices, which is then used to 765 // remove cases from the switch. 766 auto IsTriviallyUnswitchableExitBlock = [&](BasicBlock &BBToCheck) { 767 // BBToCheck is not an exit block if it is inside loop L. 768 if (L.contains(&BBToCheck)) 769 return false; 770 // BBToCheck is not trivial to unswitch if its phis aren't loop invariant. 771 if (!areLoopExitPHIsLoopInvariant(L, *ParentBB, BBToCheck)) 772 return false; 773 // We do not unswitch a block that only has an unreachable statement, as 774 // it's possible this is a previously unswitched block. Only unswitch if 775 // either the terminator is not unreachable, or, if it is, it's not the only 776 // instruction in the block. 777 auto *TI = BBToCheck.getTerminator(); 778 bool isUnreachable = isa<UnreachableInst>(TI); 779 return !isUnreachable || &*BBToCheck.getFirstNonPHIOrDbg() != TI; 780 }; 781 782 SmallVector<int, 4> ExitCaseIndices; 783 for (auto Case : SI.cases()) 784 if (IsTriviallyUnswitchableExitBlock(*Case.getCaseSuccessor())) 785 ExitCaseIndices.push_back(Case.getCaseIndex()); 786 BasicBlock *DefaultExitBB = nullptr; 787 SwitchInstProfUpdateWrapper::CaseWeightOpt DefaultCaseWeight = 788 SwitchInstProfUpdateWrapper::getSuccessorWeight(SI, 0); 789 if (IsTriviallyUnswitchableExitBlock(*SI.getDefaultDest())) { 790 DefaultExitBB = SI.getDefaultDest(); 791 } else if (ExitCaseIndices.empty()) 792 return false; 793 794 LLVM_DEBUG(dbgs() << " unswitching trivial switch...\n"); 795 796 if (MSSAU && VerifyMemorySSA) 797 MSSAU->getMemorySSA()->verifyMemorySSA(); 798 799 // We may need to invalidate SCEVs for the outermost loop reached by any of 800 // the exits. 801 Loop *OuterL = &L; 802 803 if (DefaultExitBB) { 804 // Check the loop containing this exit. 805 Loop *ExitL = getTopMostExitingLoop(DefaultExitBB, LI); 806 if (!ExitL || ExitL->contains(OuterL)) 807 OuterL = ExitL; 808 } 809 for (unsigned Index : ExitCaseIndices) { 810 auto CaseI = SI.case_begin() + Index; 811 // Compute the outer loop from this exit. 812 Loop *ExitL = getTopMostExitingLoop(CaseI->getCaseSuccessor(), LI); 813 if (!ExitL || ExitL->contains(OuterL)) 814 OuterL = ExitL; 815 } 816 817 if (SE) { 818 if (OuterL) 819 SE->forgetLoop(OuterL); 820 else 821 SE->forgetTopmostLoop(&L); 822 } 823 824 if (DefaultExitBB) { 825 // Clear out the default destination temporarily to allow accurate 826 // predecessor lists to be examined below. 827 SI.setDefaultDest(nullptr); 828 } 829 830 // Store the exit cases into a separate data structure and remove them from 831 // the switch. 832 SmallVector<std::tuple<ConstantInt *, BasicBlock *, 833 SwitchInstProfUpdateWrapper::CaseWeightOpt>, 834 4> ExitCases; 835 ExitCases.reserve(ExitCaseIndices.size()); 836 SwitchInstProfUpdateWrapper SIW(SI); 837 // We walk the case indices backwards so that we remove the last case first 838 // and don't disrupt the earlier indices. 839 for (unsigned Index : reverse(ExitCaseIndices)) { 840 auto CaseI = SI.case_begin() + Index; 841 // Save the value of this case. 842 auto W = SIW.getSuccessorWeight(CaseI->getSuccessorIndex()); 843 ExitCases.emplace_back(CaseI->getCaseValue(), CaseI->getCaseSuccessor(), W); 844 // Delete the unswitched cases. 845 SIW.removeCase(CaseI); 846 } 847 848 // Check if after this all of the remaining cases point at the same 849 // successor. 850 BasicBlock *CommonSuccBB = nullptr; 851 if (SI.getNumCases() > 0 && 852 all_of(drop_begin(SI.cases()), [&SI](const SwitchInst::CaseHandle &Case) { 853 return Case.getCaseSuccessor() == SI.case_begin()->getCaseSuccessor(); 854 })) 855 CommonSuccBB = SI.case_begin()->getCaseSuccessor(); 856 if (!DefaultExitBB) { 857 // If we're not unswitching the default, we need it to match any cases to 858 // have a common successor or if we have no cases it is the common 859 // successor. 860 if (SI.getNumCases() == 0) 861 CommonSuccBB = SI.getDefaultDest(); 862 else if (SI.getDefaultDest() != CommonSuccBB) 863 CommonSuccBB = nullptr; 864 } 865 866 // Split the preheader, so that we know that there is a safe place to insert 867 // the switch. 868 BasicBlock *OldPH = L.getLoopPreheader(); 869 BasicBlock *NewPH = SplitEdge(OldPH, L.getHeader(), &DT, &LI, MSSAU); 870 OldPH->getTerminator()->eraseFromParent(); 871 872 // Now add the unswitched switch. This new switch instruction inherits the 873 // debug location of the old switch, because it semantically replace the old 874 // one. 875 auto *NewSI = SwitchInst::Create(LoopCond, NewPH, ExitCases.size(), OldPH); 876 NewSI->setDebugLoc(SIW->getDebugLoc()); 877 SwitchInstProfUpdateWrapper NewSIW(*NewSI); 878 879 // Rewrite the IR for the unswitched basic blocks. This requires two steps. 880 // First, we split any exit blocks with remaining in-loop predecessors. Then 881 // we update the PHIs in one of two ways depending on if there was a split. 882 // We walk in reverse so that we split in the same order as the cases 883 // appeared. This is purely for convenience of reading the resulting IR, but 884 // it doesn't cost anything really. 885 SmallPtrSet<BasicBlock *, 2> UnswitchedExitBBs; 886 SmallDenseMap<BasicBlock *, BasicBlock *, 2> SplitExitBBMap; 887 // Handle the default exit if necessary. 888 // FIXME: It'd be great if we could merge this with the loop below but LLVM's 889 // ranges aren't quite powerful enough yet. 890 if (DefaultExitBB) { 891 if (pred_empty(DefaultExitBB)) { 892 UnswitchedExitBBs.insert(DefaultExitBB); 893 rewritePHINodesForUnswitchedExitBlock(*DefaultExitBB, *ParentBB, *OldPH); 894 } else { 895 auto *SplitBB = 896 SplitBlock(DefaultExitBB, DefaultExitBB->begin(), &DT, &LI, MSSAU); 897 rewritePHINodesForExitAndUnswitchedBlocks(*DefaultExitBB, *SplitBB, 898 *ParentBB, *OldPH, 899 /*FullUnswitch*/ true); 900 DefaultExitBB = SplitExitBBMap[DefaultExitBB] = SplitBB; 901 } 902 } 903 // Note that we must use a reference in the for loop so that we update the 904 // container. 905 for (auto &ExitCase : reverse(ExitCases)) { 906 // Grab a reference to the exit block in the pair so that we can update it. 907 BasicBlock *ExitBB = std::get<1>(ExitCase); 908 909 // If this case is the last edge into the exit block, we can simply reuse it 910 // as it will no longer be a loop exit. No mapping necessary. 911 if (pred_empty(ExitBB)) { 912 // Only rewrite once. 913 if (UnswitchedExitBBs.insert(ExitBB).second) 914 rewritePHINodesForUnswitchedExitBlock(*ExitBB, *ParentBB, *OldPH); 915 continue; 916 } 917 918 // Otherwise we need to split the exit block so that we retain an exit 919 // block from the loop and a target for the unswitched condition. 920 BasicBlock *&SplitExitBB = SplitExitBBMap[ExitBB]; 921 if (!SplitExitBB) { 922 // If this is the first time we see this, do the split and remember it. 923 SplitExitBB = SplitBlock(ExitBB, ExitBB->begin(), &DT, &LI, MSSAU); 924 rewritePHINodesForExitAndUnswitchedBlocks(*ExitBB, *SplitExitBB, 925 *ParentBB, *OldPH, 926 /*FullUnswitch*/ true); 927 } 928 // Update the case pair to point to the split block. 929 std::get<1>(ExitCase) = SplitExitBB; 930 } 931 932 // Now add the unswitched cases. We do this in reverse order as we built them 933 // in reverse order. 934 for (auto &ExitCase : reverse(ExitCases)) { 935 ConstantInt *CaseVal = std::get<0>(ExitCase); 936 BasicBlock *UnswitchedBB = std::get<1>(ExitCase); 937 938 NewSIW.addCase(CaseVal, UnswitchedBB, std::get<2>(ExitCase)); 939 } 940 941 // If the default was unswitched, re-point it and add explicit cases for 942 // entering the loop. 943 if (DefaultExitBB) { 944 NewSIW->setDefaultDest(DefaultExitBB); 945 NewSIW.setSuccessorWeight(0, DefaultCaseWeight); 946 947 // We removed all the exit cases, so we just copy the cases to the 948 // unswitched switch. 949 for (const auto &Case : SI.cases()) 950 NewSIW.addCase(Case.getCaseValue(), NewPH, 951 SIW.getSuccessorWeight(Case.getSuccessorIndex())); 952 } else if (DefaultCaseWeight) { 953 // We have to set branch weight of the default case. 954 uint64_t SW = *DefaultCaseWeight; 955 for (const auto &Case : SI.cases()) { 956 auto W = SIW.getSuccessorWeight(Case.getSuccessorIndex()); 957 assert(W && 958 "case weight must be defined as default case weight is defined"); 959 SW += *W; 960 } 961 NewSIW.setSuccessorWeight(0, SW); 962 } 963 964 // If we ended up with a common successor for every path through the switch 965 // after unswitching, rewrite it to an unconditional branch to make it easy 966 // to recognize. Otherwise we potentially have to recognize the default case 967 // pointing at unreachable and other complexity. 968 if (CommonSuccBB) { 969 BasicBlock *BB = SI.getParent(); 970 // We may have had multiple edges to this common successor block, so remove 971 // them as predecessors. We skip the first one, either the default or the 972 // actual first case. 973 bool SkippedFirst = DefaultExitBB == nullptr; 974 for (auto Case : SI.cases()) { 975 assert(Case.getCaseSuccessor() == CommonSuccBB && 976 "Non-common successor!"); 977 (void)Case; 978 if (!SkippedFirst) { 979 SkippedFirst = true; 980 continue; 981 } 982 CommonSuccBB->removePredecessor(BB, 983 /*KeepOneInputPHIs*/ true); 984 } 985 // Now nuke the switch and replace it with a direct branch. 986 Instruction *NewBI = BranchInst::Create(CommonSuccBB, BB); 987 NewBI->setDebugLoc(SIW->getDebugLoc()); 988 SIW.eraseFromParent(); 989 } else if (DefaultExitBB) { 990 assert(SI.getNumCases() > 0 && 991 "If we had no cases we'd have a common successor!"); 992 // Move the last case to the default successor. This is valid as if the 993 // default got unswitched it cannot be reached. This has the advantage of 994 // being simple and keeping the number of edges from this switch to 995 // successors the same, and avoiding any PHI update complexity. 996 auto LastCaseI = std::prev(SI.case_end()); 997 998 SI.setDefaultDest(LastCaseI->getCaseSuccessor()); 999 SIW.setSuccessorWeight( 1000 0, SIW.getSuccessorWeight(LastCaseI->getSuccessorIndex())); 1001 SIW.removeCase(LastCaseI); 1002 } 1003 1004 // Walk the unswitched exit blocks and the unswitched split blocks and update 1005 // the dominator tree based on the CFG edits. While we are walking unordered 1006 // containers here, the API for applyUpdates takes an unordered list of 1007 // updates and requires them to not contain duplicates. 1008 SmallVector<DominatorTree::UpdateType, 4> DTUpdates; 1009 for (auto *UnswitchedExitBB : UnswitchedExitBBs) { 1010 DTUpdates.push_back({DT.Delete, ParentBB, UnswitchedExitBB}); 1011 DTUpdates.push_back({DT.Insert, OldPH, UnswitchedExitBB}); 1012 } 1013 for (auto SplitUnswitchedPair : SplitExitBBMap) { 1014 DTUpdates.push_back({DT.Delete, ParentBB, SplitUnswitchedPair.first}); 1015 DTUpdates.push_back({DT.Insert, OldPH, SplitUnswitchedPair.second}); 1016 } 1017 1018 if (MSSAU) { 1019 MSSAU->applyUpdates(DTUpdates, DT, /*UpdateDT=*/true); 1020 if (VerifyMemorySSA) 1021 MSSAU->getMemorySSA()->verifyMemorySSA(); 1022 } else { 1023 DT.applyUpdates(DTUpdates); 1024 } 1025 1026 assert(DT.verify(DominatorTree::VerificationLevel::Fast)); 1027 1028 // We may have changed the nesting relationship for this loop so hoist it to 1029 // its correct parent if needed. 1030 hoistLoopToNewParent(L, *NewPH, DT, LI, MSSAU, SE); 1031 1032 if (MSSAU && VerifyMemorySSA) 1033 MSSAU->getMemorySSA()->verifyMemorySSA(); 1034 1035 ++NumTrivial; 1036 ++NumSwitches; 1037 LLVM_DEBUG(dbgs() << " done: unswitching trivial switch...\n"); 1038 return true; 1039 } 1040 1041 /// This routine scans the loop to find a branch or switch which occurs before 1042 /// any side effects occur. These can potentially be unswitched without 1043 /// duplicating the loop. If a branch or switch is successfully unswitched the 1044 /// scanning continues to see if subsequent branches or switches have become 1045 /// trivial. Once all trivial candidates have been unswitched, this routine 1046 /// returns. 1047 /// 1048 /// The return value indicates whether anything was unswitched (and therefore 1049 /// changed). 1050 /// 1051 /// If `SE` is not null, it will be updated based on the potential loop SCEVs 1052 /// invalidated by this. 1053 static bool unswitchAllTrivialConditions(Loop &L, DominatorTree &DT, 1054 LoopInfo &LI, ScalarEvolution *SE, 1055 MemorySSAUpdater *MSSAU) { 1056 bool Changed = false; 1057 1058 // If loop header has only one reachable successor we should keep looking for 1059 // trivial condition candidates in the successor as well. An alternative is 1060 // to constant fold conditions and merge successors into loop header (then we 1061 // only need to check header's terminator). The reason for not doing this in 1062 // LoopUnswitch pass is that it could potentially break LoopPassManager's 1063 // invariants. Folding dead branches could either eliminate the current loop 1064 // or make other loops unreachable. LCSSA form might also not be preserved 1065 // after deleting branches. The following code keeps traversing loop header's 1066 // successors until it finds the trivial condition candidate (condition that 1067 // is not a constant). Since unswitching generates branches with constant 1068 // conditions, this scenario could be very common in practice. 1069 BasicBlock *CurrentBB = L.getHeader(); 1070 SmallPtrSet<BasicBlock *, 8> Visited; 1071 Visited.insert(CurrentBB); 1072 do { 1073 // Check if there are any side-effecting instructions (e.g. stores, calls, 1074 // volatile loads) in the part of the loop that the code *would* execute 1075 // without unswitching. 1076 if (MSSAU) // Possible early exit with MSSA 1077 if (auto *Defs = MSSAU->getMemorySSA()->getBlockDefs(CurrentBB)) 1078 if (!isa<MemoryPhi>(*Defs->begin()) || (++Defs->begin() != Defs->end())) 1079 return Changed; 1080 if (llvm::any_of(*CurrentBB, 1081 [](Instruction &I) { return I.mayHaveSideEffects(); })) 1082 return Changed; 1083 1084 Instruction *CurrentTerm = CurrentBB->getTerminator(); 1085 1086 if (auto *SI = dyn_cast<SwitchInst>(CurrentTerm)) { 1087 // Don't bother trying to unswitch past a switch with a constant 1088 // condition. This should be removed prior to running this pass by 1089 // simplifycfg. 1090 if (isa<Constant>(SI->getCondition())) 1091 return Changed; 1092 1093 if (!unswitchTrivialSwitch(L, *SI, DT, LI, SE, MSSAU)) 1094 // Couldn't unswitch this one so we're done. 1095 return Changed; 1096 1097 // Mark that we managed to unswitch something. 1098 Changed = true; 1099 1100 // If unswitching turned the terminator into an unconditional branch then 1101 // we can continue. The unswitching logic specifically works to fold any 1102 // cases it can into an unconditional branch to make it easier to 1103 // recognize here. 1104 auto *BI = dyn_cast<BranchInst>(CurrentBB->getTerminator()); 1105 if (!BI || BI->isConditional()) 1106 return Changed; 1107 1108 CurrentBB = BI->getSuccessor(0); 1109 continue; 1110 } 1111 1112 auto *BI = dyn_cast<BranchInst>(CurrentTerm); 1113 if (!BI) 1114 // We do not understand other terminator instructions. 1115 return Changed; 1116 1117 // Don't bother trying to unswitch past an unconditional branch or a branch 1118 // with a constant value. These should be removed by simplifycfg prior to 1119 // running this pass. 1120 if (!BI->isConditional() || 1121 isa<Constant>(skipTrivialSelect(BI->getCondition()))) 1122 return Changed; 1123 1124 // Found a trivial condition candidate: non-foldable conditional branch. If 1125 // we fail to unswitch this, we can't do anything else that is trivial. 1126 if (!unswitchTrivialBranch(L, *BI, DT, LI, SE, MSSAU)) 1127 return Changed; 1128 1129 // Mark that we managed to unswitch something. 1130 Changed = true; 1131 1132 // If we only unswitched some of the conditions feeding the branch, we won't 1133 // have collapsed it to a single successor. 1134 BI = cast<BranchInst>(CurrentBB->getTerminator()); 1135 if (BI->isConditional()) 1136 return Changed; 1137 1138 // Follow the newly unconditional branch into its successor. 1139 CurrentBB = BI->getSuccessor(0); 1140 1141 // When continuing, if we exit the loop or reach a previous visited block, 1142 // then we can not reach any trivial condition candidates (unfoldable 1143 // branch instructions or switch instructions) and no unswitch can happen. 1144 } while (L.contains(CurrentBB) && Visited.insert(CurrentBB).second); 1145 1146 return Changed; 1147 } 1148 1149 /// Build the cloned blocks for an unswitched copy of the given loop. 1150 /// 1151 /// The cloned blocks are inserted before the loop preheader (`LoopPH`) and 1152 /// after the split block (`SplitBB`) that will be used to select between the 1153 /// cloned and original loop. 1154 /// 1155 /// This routine handles cloning all of the necessary loop blocks and exit 1156 /// blocks including rewriting their instructions and the relevant PHI nodes. 1157 /// Any loop blocks or exit blocks which are dominated by a different successor 1158 /// than the one for this clone of the loop blocks can be trivially skipped. We 1159 /// use the `DominatingSucc` map to determine whether a block satisfies that 1160 /// property with a simple map lookup. 1161 /// 1162 /// It also correctly creates the unconditional branch in the cloned 1163 /// unswitched parent block to only point at the unswitched successor. 1164 /// 1165 /// This does not handle most of the necessary updates to `LoopInfo`. Only exit 1166 /// block splitting is correctly reflected in `LoopInfo`, essentially all of 1167 /// the cloned blocks (and their loops) are left without full `LoopInfo` 1168 /// updates. This also doesn't fully update `DominatorTree`. It adds the cloned 1169 /// blocks to them but doesn't create the cloned `DominatorTree` structure and 1170 /// instead the caller must recompute an accurate DT. It *does* correctly 1171 /// update the `AssumptionCache` provided in `AC`. 1172 static BasicBlock *buildClonedLoopBlocks( 1173 Loop &L, BasicBlock *LoopPH, BasicBlock *SplitBB, 1174 ArrayRef<BasicBlock *> ExitBlocks, BasicBlock *ParentBB, 1175 BasicBlock *UnswitchedSuccBB, BasicBlock *ContinueSuccBB, 1176 const SmallDenseMap<BasicBlock *, BasicBlock *, 16> &DominatingSucc, 1177 ValueToValueMapTy &VMap, 1178 SmallVectorImpl<DominatorTree::UpdateType> &DTUpdates, AssumptionCache &AC, 1179 DominatorTree &DT, LoopInfo &LI, MemorySSAUpdater *MSSAU, 1180 ScalarEvolution *SE) { 1181 SmallVector<BasicBlock *, 4> NewBlocks; 1182 NewBlocks.reserve(L.getNumBlocks() + ExitBlocks.size()); 1183 1184 // We will need to clone a bunch of blocks, wrap up the clone operation in 1185 // a helper. 1186 auto CloneBlock = [&](BasicBlock *OldBB) { 1187 // Clone the basic block and insert it before the new preheader. 1188 BasicBlock *NewBB = CloneBasicBlock(OldBB, VMap, ".us", OldBB->getParent()); 1189 NewBB->moveBefore(LoopPH); 1190 1191 // Record this block and the mapping. 1192 NewBlocks.push_back(NewBB); 1193 VMap[OldBB] = NewBB; 1194 1195 return NewBB; 1196 }; 1197 1198 // We skip cloning blocks when they have a dominating succ that is not the 1199 // succ we are cloning for. 1200 auto SkipBlock = [&](BasicBlock *BB) { 1201 auto It = DominatingSucc.find(BB); 1202 return It != DominatingSucc.end() && It->second != UnswitchedSuccBB; 1203 }; 1204 1205 // First, clone the preheader. 1206 auto *ClonedPH = CloneBlock(LoopPH); 1207 1208 // Then clone all the loop blocks, skipping the ones that aren't necessary. 1209 for (auto *LoopBB : L.blocks()) 1210 if (!SkipBlock(LoopBB)) 1211 CloneBlock(LoopBB); 1212 1213 // Split all the loop exit edges so that when we clone the exit blocks, if 1214 // any of the exit blocks are *also* a preheader for some other loop, we 1215 // don't create multiple predecessors entering the loop header. 1216 for (auto *ExitBB : ExitBlocks) { 1217 if (SkipBlock(ExitBB)) 1218 continue; 1219 1220 // When we are going to clone an exit, we don't need to clone all the 1221 // instructions in the exit block and we want to ensure we have an easy 1222 // place to merge the CFG, so split the exit first. This is always safe to 1223 // do because there cannot be any non-loop predecessors of a loop exit in 1224 // loop simplified form. 1225 auto *MergeBB = SplitBlock(ExitBB, ExitBB->begin(), &DT, &LI, MSSAU); 1226 1227 // Rearrange the names to make it easier to write test cases by having the 1228 // exit block carry the suffix rather than the merge block carrying the 1229 // suffix. 1230 MergeBB->takeName(ExitBB); 1231 ExitBB->setName(Twine(MergeBB->getName()) + ".split"); 1232 1233 // Now clone the original exit block. 1234 auto *ClonedExitBB = CloneBlock(ExitBB); 1235 assert(ClonedExitBB->getTerminator()->getNumSuccessors() == 1 && 1236 "Exit block should have been split to have one successor!"); 1237 assert(ClonedExitBB->getTerminator()->getSuccessor(0) == MergeBB && 1238 "Cloned exit block has the wrong successor!"); 1239 1240 // Remap any cloned instructions and create a merge phi node for them. 1241 for (auto ZippedInsts : llvm::zip_first( 1242 llvm::make_range(ExitBB->begin(), std::prev(ExitBB->end())), 1243 llvm::make_range(ClonedExitBB->begin(), 1244 std::prev(ClonedExitBB->end())))) { 1245 Instruction &I = std::get<0>(ZippedInsts); 1246 Instruction &ClonedI = std::get<1>(ZippedInsts); 1247 1248 // The only instructions in the exit block should be PHI nodes and 1249 // potentially a landing pad. 1250 assert( 1251 (isa<PHINode>(I) || isa<LandingPadInst>(I) || isa<CatchPadInst>(I)) && 1252 "Bad instruction in exit block!"); 1253 // We should have a value map between the instruction and its clone. 1254 assert(VMap.lookup(&I) == &ClonedI && "Mismatch in the value map!"); 1255 1256 // Forget SCEVs based on exit phis in case SCEV looked through the phi. 1257 if (SE) 1258 if (auto *PN = dyn_cast<PHINode>(&I)) 1259 SE->forgetLcssaPhiWithNewPredecessor(&L, PN); 1260 1261 BasicBlock::iterator InsertPt = MergeBB->getFirstInsertionPt(); 1262 1263 auto *MergePN = 1264 PHINode::Create(I.getType(), /*NumReservedValues*/ 2, ".us-phi"); 1265 MergePN->insertBefore(InsertPt); 1266 MergePN->setDebugLoc(InsertPt->getDebugLoc()); 1267 I.replaceAllUsesWith(MergePN); 1268 MergePN->addIncoming(&I, ExitBB); 1269 MergePN->addIncoming(&ClonedI, ClonedExitBB); 1270 } 1271 } 1272 1273 // Rewrite the instructions in the cloned blocks to refer to the instructions 1274 // in the cloned blocks. We have to do this as a second pass so that we have 1275 // everything available. Also, we have inserted new instructions which may 1276 // include assume intrinsics, so we update the assumption cache while 1277 // processing this. 1278 Module *M = ClonedPH->getParent()->getParent(); 1279 for (auto *ClonedBB : NewBlocks) 1280 for (Instruction &I : *ClonedBB) { 1281 RemapDbgRecordRange(M, I.getDbgRecordRange(), VMap, 1282 RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); 1283 RemapInstruction(&I, VMap, 1284 RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); 1285 if (auto *II = dyn_cast<AssumeInst>(&I)) 1286 AC.registerAssumption(II); 1287 } 1288 1289 // Update any PHI nodes in the cloned successors of the skipped blocks to not 1290 // have spurious incoming values. 1291 for (auto *LoopBB : L.blocks()) 1292 if (SkipBlock(LoopBB)) 1293 for (auto *SuccBB : successors(LoopBB)) 1294 if (auto *ClonedSuccBB = cast_or_null<BasicBlock>(VMap.lookup(SuccBB))) 1295 for (PHINode &PN : ClonedSuccBB->phis()) 1296 PN.removeIncomingValue(LoopBB, /*DeletePHIIfEmpty*/ false); 1297 1298 // Remove the cloned parent as a predecessor of any successor we ended up 1299 // cloning other than the unswitched one. 1300 auto *ClonedParentBB = cast<BasicBlock>(VMap.lookup(ParentBB)); 1301 for (auto *SuccBB : successors(ParentBB)) { 1302 if (SuccBB == UnswitchedSuccBB) 1303 continue; 1304 1305 auto *ClonedSuccBB = cast_or_null<BasicBlock>(VMap.lookup(SuccBB)); 1306 if (!ClonedSuccBB) 1307 continue; 1308 1309 ClonedSuccBB->removePredecessor(ClonedParentBB, 1310 /*KeepOneInputPHIs*/ true); 1311 } 1312 1313 // Replace the cloned branch with an unconditional branch to the cloned 1314 // unswitched successor. 1315 auto *ClonedSuccBB = cast<BasicBlock>(VMap.lookup(UnswitchedSuccBB)); 1316 Instruction *ClonedTerminator = ClonedParentBB->getTerminator(); 1317 // Trivial Simplification. If Terminator is a conditional branch and 1318 // condition becomes dead - erase it. 1319 Value *ClonedConditionToErase = nullptr; 1320 if (auto *BI = dyn_cast<BranchInst>(ClonedTerminator)) 1321 ClonedConditionToErase = BI->getCondition(); 1322 else if (auto *SI = dyn_cast<SwitchInst>(ClonedTerminator)) 1323 ClonedConditionToErase = SI->getCondition(); 1324 1325 Instruction *BI = BranchInst::Create(ClonedSuccBB, ClonedParentBB); 1326 BI->setDebugLoc(ClonedTerminator->getDebugLoc()); 1327 ClonedTerminator->eraseFromParent(); 1328 1329 if (ClonedConditionToErase) 1330 RecursivelyDeleteTriviallyDeadInstructions(ClonedConditionToErase, nullptr, 1331 MSSAU); 1332 1333 // If there are duplicate entries in the PHI nodes because of multiple edges 1334 // to the unswitched successor, we need to nuke all but one as we replaced it 1335 // with a direct branch. 1336 for (PHINode &PN : ClonedSuccBB->phis()) { 1337 bool Found = false; 1338 // Loop over the incoming operands backwards so we can easily delete as we 1339 // go without invalidating the index. 1340 for (int i = PN.getNumOperands() - 1; i >= 0; --i) { 1341 if (PN.getIncomingBlock(i) != ClonedParentBB) 1342 continue; 1343 if (!Found) { 1344 Found = true; 1345 continue; 1346 } 1347 PN.removeIncomingValue(i, /*DeletePHIIfEmpty*/ false); 1348 } 1349 } 1350 1351 // Record the domtree updates for the new blocks. 1352 SmallPtrSet<BasicBlock *, 4> SuccSet; 1353 for (auto *ClonedBB : NewBlocks) { 1354 for (auto *SuccBB : successors(ClonedBB)) 1355 if (SuccSet.insert(SuccBB).second) 1356 DTUpdates.push_back({DominatorTree::Insert, ClonedBB, SuccBB}); 1357 SuccSet.clear(); 1358 } 1359 1360 return ClonedPH; 1361 } 1362 1363 /// Recursively clone the specified loop and all of its children. 1364 /// 1365 /// The target parent loop for the clone should be provided, or can be null if 1366 /// the clone is a top-level loop. While cloning, all the blocks are mapped 1367 /// with the provided value map. The entire original loop must be present in 1368 /// the value map. The cloned loop is returned. 1369 static Loop *cloneLoopNest(Loop &OrigRootL, Loop *RootParentL, 1370 const ValueToValueMapTy &VMap, LoopInfo &LI) { 1371 auto AddClonedBlocksToLoop = [&](Loop &OrigL, Loop &ClonedL) { 1372 assert(ClonedL.getBlocks().empty() && "Must start with an empty loop!"); 1373 ClonedL.reserveBlocks(OrigL.getNumBlocks()); 1374 for (auto *BB : OrigL.blocks()) { 1375 auto *ClonedBB = cast<BasicBlock>(VMap.lookup(BB)); 1376 ClonedL.addBlockEntry(ClonedBB); 1377 if (LI.getLoopFor(BB) == &OrigL) 1378 LI.changeLoopFor(ClonedBB, &ClonedL); 1379 } 1380 }; 1381 1382 // We specially handle the first loop because it may get cloned into 1383 // a different parent and because we most commonly are cloning leaf loops. 1384 Loop *ClonedRootL = LI.AllocateLoop(); 1385 if (RootParentL) 1386 RootParentL->addChildLoop(ClonedRootL); 1387 else 1388 LI.addTopLevelLoop(ClonedRootL); 1389 AddClonedBlocksToLoop(OrigRootL, *ClonedRootL); 1390 1391 if (OrigRootL.isInnermost()) 1392 return ClonedRootL; 1393 1394 // If we have a nest, we can quickly clone the entire loop nest using an 1395 // iterative approach because it is a tree. We keep the cloned parent in the 1396 // data structure to avoid repeatedly querying through a map to find it. 1397 SmallVector<std::pair<Loop *, Loop *>, 16> LoopsToClone; 1398 // Build up the loops to clone in reverse order as we'll clone them from the 1399 // back. 1400 for (Loop *ChildL : llvm::reverse(OrigRootL)) 1401 LoopsToClone.push_back({ClonedRootL, ChildL}); 1402 do { 1403 Loop *ClonedParentL, *L; 1404 std::tie(ClonedParentL, L) = LoopsToClone.pop_back_val(); 1405 Loop *ClonedL = LI.AllocateLoop(); 1406 ClonedParentL->addChildLoop(ClonedL); 1407 AddClonedBlocksToLoop(*L, *ClonedL); 1408 for (Loop *ChildL : llvm::reverse(*L)) 1409 LoopsToClone.push_back({ClonedL, ChildL}); 1410 } while (!LoopsToClone.empty()); 1411 1412 return ClonedRootL; 1413 } 1414 1415 /// Build the cloned loops of an original loop from unswitching. 1416 /// 1417 /// Because unswitching simplifies the CFG of the loop, this isn't a trivial 1418 /// operation. We need to re-verify that there even is a loop (as the backedge 1419 /// may not have been cloned), and even if there are remaining backedges the 1420 /// backedge set may be different. However, we know that each child loop is 1421 /// undisturbed, we only need to find where to place each child loop within 1422 /// either any parent loop or within a cloned version of the original loop. 1423 /// 1424 /// Because child loops may end up cloned outside of any cloned version of the 1425 /// original loop, multiple cloned sibling loops may be created. All of them 1426 /// are returned so that the newly introduced loop nest roots can be 1427 /// identified. 1428 static void buildClonedLoops(Loop &OrigL, ArrayRef<BasicBlock *> ExitBlocks, 1429 const ValueToValueMapTy &VMap, LoopInfo &LI, 1430 SmallVectorImpl<Loop *> &NonChildClonedLoops) { 1431 Loop *ClonedL = nullptr; 1432 1433 auto *OrigPH = OrigL.getLoopPreheader(); 1434 auto *OrigHeader = OrigL.getHeader(); 1435 1436 auto *ClonedPH = cast<BasicBlock>(VMap.lookup(OrigPH)); 1437 auto *ClonedHeader = cast<BasicBlock>(VMap.lookup(OrigHeader)); 1438 1439 // We need to know the loops of the cloned exit blocks to even compute the 1440 // accurate parent loop. If we only clone exits to some parent of the 1441 // original parent, we want to clone into that outer loop. We also keep track 1442 // of the loops that our cloned exit blocks participate in. 1443 Loop *ParentL = nullptr; 1444 SmallVector<BasicBlock *, 4> ClonedExitsInLoops; 1445 SmallDenseMap<BasicBlock *, Loop *, 16> ExitLoopMap; 1446 ClonedExitsInLoops.reserve(ExitBlocks.size()); 1447 for (auto *ExitBB : ExitBlocks) 1448 if (auto *ClonedExitBB = cast_or_null<BasicBlock>(VMap.lookup(ExitBB))) 1449 if (Loop *ExitL = LI.getLoopFor(ExitBB)) { 1450 ExitLoopMap[ClonedExitBB] = ExitL; 1451 ClonedExitsInLoops.push_back(ClonedExitBB); 1452 if (!ParentL || (ParentL != ExitL && ParentL->contains(ExitL))) 1453 ParentL = ExitL; 1454 } 1455 assert((!ParentL || ParentL == OrigL.getParentLoop() || 1456 ParentL->contains(OrigL.getParentLoop())) && 1457 "The computed parent loop should always contain (or be) the parent of " 1458 "the original loop."); 1459 1460 // We build the set of blocks dominated by the cloned header from the set of 1461 // cloned blocks out of the original loop. While not all of these will 1462 // necessarily be in the cloned loop, it is enough to establish that they 1463 // aren't in unreachable cycles, etc. 1464 SmallSetVector<BasicBlock *, 16> ClonedLoopBlocks; 1465 for (auto *BB : OrigL.blocks()) 1466 if (auto *ClonedBB = cast_or_null<BasicBlock>(VMap.lookup(BB))) 1467 ClonedLoopBlocks.insert(ClonedBB); 1468 1469 // Rebuild the set of blocks that will end up in the cloned loop. We may have 1470 // skipped cloning some region of this loop which can in turn skip some of 1471 // the backedges so we have to rebuild the blocks in the loop based on the 1472 // backedges that remain after cloning. 1473 SmallVector<BasicBlock *, 16> Worklist; 1474 SmallPtrSet<BasicBlock *, 16> BlocksInClonedLoop; 1475 for (auto *Pred : predecessors(ClonedHeader)) { 1476 // The only possible non-loop header predecessor is the preheader because 1477 // we know we cloned the loop in simplified form. 1478 if (Pred == ClonedPH) 1479 continue; 1480 1481 // Because the loop was in simplified form, the only non-loop predecessor 1482 // should be the preheader. 1483 assert(ClonedLoopBlocks.count(Pred) && "Found a predecessor of the loop " 1484 "header other than the preheader " 1485 "that is not part of the loop!"); 1486 1487 // Insert this block into the loop set and on the first visit (and if it 1488 // isn't the header we're currently walking) put it into the worklist to 1489 // recurse through. 1490 if (BlocksInClonedLoop.insert(Pred).second && Pred != ClonedHeader) 1491 Worklist.push_back(Pred); 1492 } 1493 1494 // If we had any backedges then there *is* a cloned loop. Put the header into 1495 // the loop set and then walk the worklist backwards to find all the blocks 1496 // that remain within the loop after cloning. 1497 if (!BlocksInClonedLoop.empty()) { 1498 BlocksInClonedLoop.insert(ClonedHeader); 1499 1500 while (!Worklist.empty()) { 1501 BasicBlock *BB = Worklist.pop_back_val(); 1502 assert(BlocksInClonedLoop.count(BB) && 1503 "Didn't put block into the loop set!"); 1504 1505 // Insert any predecessors that are in the possible set into the cloned 1506 // set, and if the insert is successful, add them to the worklist. Note 1507 // that we filter on the blocks that are definitely reachable via the 1508 // backedge to the loop header so we may prune out dead code within the 1509 // cloned loop. 1510 for (auto *Pred : predecessors(BB)) 1511 if (ClonedLoopBlocks.count(Pred) && 1512 BlocksInClonedLoop.insert(Pred).second) 1513 Worklist.push_back(Pred); 1514 } 1515 1516 ClonedL = LI.AllocateLoop(); 1517 if (ParentL) { 1518 ParentL->addBasicBlockToLoop(ClonedPH, LI); 1519 ParentL->addChildLoop(ClonedL); 1520 } else { 1521 LI.addTopLevelLoop(ClonedL); 1522 } 1523 NonChildClonedLoops.push_back(ClonedL); 1524 1525 ClonedL->reserveBlocks(BlocksInClonedLoop.size()); 1526 // We don't want to just add the cloned loop blocks based on how we 1527 // discovered them. The original order of blocks was carefully built in 1528 // a way that doesn't rely on predecessor ordering. Rather than re-invent 1529 // that logic, we just re-walk the original blocks (and those of the child 1530 // loops) and filter them as we add them into the cloned loop. 1531 for (auto *BB : OrigL.blocks()) { 1532 auto *ClonedBB = cast_or_null<BasicBlock>(VMap.lookup(BB)); 1533 if (!ClonedBB || !BlocksInClonedLoop.count(ClonedBB)) 1534 continue; 1535 1536 // Directly add the blocks that are only in this loop. 1537 if (LI.getLoopFor(BB) == &OrigL) { 1538 ClonedL->addBasicBlockToLoop(ClonedBB, LI); 1539 continue; 1540 } 1541 1542 // We want to manually add it to this loop and parents. 1543 // Registering it with LoopInfo will happen when we clone the top 1544 // loop for this block. 1545 for (Loop *PL = ClonedL; PL; PL = PL->getParentLoop()) 1546 PL->addBlockEntry(ClonedBB); 1547 } 1548 1549 // Now add each child loop whose header remains within the cloned loop. All 1550 // of the blocks within the loop must satisfy the same constraints as the 1551 // header so once we pass the header checks we can just clone the entire 1552 // child loop nest. 1553 for (Loop *ChildL : OrigL) { 1554 auto *ClonedChildHeader = 1555 cast_or_null<BasicBlock>(VMap.lookup(ChildL->getHeader())); 1556 if (!ClonedChildHeader || !BlocksInClonedLoop.count(ClonedChildHeader)) 1557 continue; 1558 1559 #ifndef NDEBUG 1560 // We should never have a cloned child loop header but fail to have 1561 // all of the blocks for that child loop. 1562 for (auto *ChildLoopBB : ChildL->blocks()) 1563 assert(BlocksInClonedLoop.count( 1564 cast<BasicBlock>(VMap.lookup(ChildLoopBB))) && 1565 "Child cloned loop has a header within the cloned outer " 1566 "loop but not all of its blocks!"); 1567 #endif 1568 1569 cloneLoopNest(*ChildL, ClonedL, VMap, LI); 1570 } 1571 } 1572 1573 // Now that we've handled all the components of the original loop that were 1574 // cloned into a new loop, we still need to handle anything from the original 1575 // loop that wasn't in a cloned loop. 1576 1577 // Figure out what blocks are left to place within any loop nest containing 1578 // the unswitched loop. If we never formed a loop, the cloned PH is one of 1579 // them. 1580 SmallPtrSet<BasicBlock *, 16> UnloopedBlockSet; 1581 if (BlocksInClonedLoop.empty()) 1582 UnloopedBlockSet.insert(ClonedPH); 1583 for (auto *ClonedBB : ClonedLoopBlocks) 1584 if (!BlocksInClonedLoop.count(ClonedBB)) 1585 UnloopedBlockSet.insert(ClonedBB); 1586 1587 // Copy the cloned exits and sort them in ascending loop depth, we'll work 1588 // backwards across these to process them inside out. The order shouldn't 1589 // matter as we're just trying to build up the map from inside-out; we use 1590 // the map in a more stably ordered way below. 1591 auto OrderedClonedExitsInLoops = ClonedExitsInLoops; 1592 llvm::sort(OrderedClonedExitsInLoops, [&](BasicBlock *LHS, BasicBlock *RHS) { 1593 return ExitLoopMap.lookup(LHS)->getLoopDepth() < 1594 ExitLoopMap.lookup(RHS)->getLoopDepth(); 1595 }); 1596 1597 // Populate the existing ExitLoopMap with everything reachable from each 1598 // exit, starting from the inner most exit. 1599 while (!UnloopedBlockSet.empty() && !OrderedClonedExitsInLoops.empty()) { 1600 assert(Worklist.empty() && "Didn't clear worklist!"); 1601 1602 BasicBlock *ExitBB = OrderedClonedExitsInLoops.pop_back_val(); 1603 Loop *ExitL = ExitLoopMap.lookup(ExitBB); 1604 1605 // Walk the CFG back until we hit the cloned PH adding everything reachable 1606 // and in the unlooped set to this exit block's loop. 1607 Worklist.push_back(ExitBB); 1608 do { 1609 BasicBlock *BB = Worklist.pop_back_val(); 1610 // We can stop recursing at the cloned preheader (if we get there). 1611 if (BB == ClonedPH) 1612 continue; 1613 1614 for (BasicBlock *PredBB : predecessors(BB)) { 1615 // If this pred has already been moved to our set or is part of some 1616 // (inner) loop, no update needed. 1617 if (!UnloopedBlockSet.erase(PredBB)) { 1618 assert( 1619 (BlocksInClonedLoop.count(PredBB) || ExitLoopMap.count(PredBB)) && 1620 "Predecessor not mapped to a loop!"); 1621 continue; 1622 } 1623 1624 // We just insert into the loop set here. We'll add these blocks to the 1625 // exit loop after we build up the set in an order that doesn't rely on 1626 // predecessor order (which in turn relies on use list order). 1627 bool Inserted = ExitLoopMap.insert({PredBB, ExitL}).second; 1628 (void)Inserted; 1629 assert(Inserted && "Should only visit an unlooped block once!"); 1630 1631 // And recurse through to its predecessors. 1632 Worklist.push_back(PredBB); 1633 } 1634 } while (!Worklist.empty()); 1635 } 1636 1637 // Now that the ExitLoopMap gives as mapping for all the non-looping cloned 1638 // blocks to their outer loops, walk the cloned blocks and the cloned exits 1639 // in their original order adding them to the correct loop. 1640 1641 // We need a stable insertion order. We use the order of the original loop 1642 // order and map into the correct parent loop. 1643 for (auto *BB : llvm::concat<BasicBlock *const>( 1644 ArrayRef(ClonedPH), ClonedLoopBlocks, ClonedExitsInLoops)) 1645 if (Loop *OuterL = ExitLoopMap.lookup(BB)) 1646 OuterL->addBasicBlockToLoop(BB, LI); 1647 1648 #ifndef NDEBUG 1649 for (auto &BBAndL : ExitLoopMap) { 1650 auto *BB = BBAndL.first; 1651 auto *OuterL = BBAndL.second; 1652 assert(LI.getLoopFor(BB) == OuterL && 1653 "Failed to put all blocks into outer loops!"); 1654 } 1655 #endif 1656 1657 // Now that all the blocks are placed into the correct containing loop in the 1658 // absence of child loops, find all the potentially cloned child loops and 1659 // clone them into whatever outer loop we placed their header into. 1660 for (Loop *ChildL : OrigL) { 1661 auto *ClonedChildHeader = 1662 cast_or_null<BasicBlock>(VMap.lookup(ChildL->getHeader())); 1663 if (!ClonedChildHeader || BlocksInClonedLoop.count(ClonedChildHeader)) 1664 continue; 1665 1666 #ifndef NDEBUG 1667 for (auto *ChildLoopBB : ChildL->blocks()) 1668 assert(VMap.count(ChildLoopBB) && 1669 "Cloned a child loop header but not all of that loops blocks!"); 1670 #endif 1671 1672 NonChildClonedLoops.push_back(cloneLoopNest( 1673 *ChildL, ExitLoopMap.lookup(ClonedChildHeader), VMap, LI)); 1674 } 1675 } 1676 1677 static void 1678 deleteDeadClonedBlocks(Loop &L, ArrayRef<BasicBlock *> ExitBlocks, 1679 ArrayRef<std::unique_ptr<ValueToValueMapTy>> VMaps, 1680 DominatorTree &DT, MemorySSAUpdater *MSSAU) { 1681 // Find all the dead clones, and remove them from their successors. 1682 SmallVector<BasicBlock *, 16> DeadBlocks; 1683 for (BasicBlock *BB : llvm::concat<BasicBlock *const>(L.blocks(), ExitBlocks)) 1684 for (const auto &VMap : VMaps) 1685 if (BasicBlock *ClonedBB = cast_or_null<BasicBlock>(VMap->lookup(BB))) 1686 if (!DT.isReachableFromEntry(ClonedBB)) { 1687 for (BasicBlock *SuccBB : successors(ClonedBB)) 1688 SuccBB->removePredecessor(ClonedBB); 1689 DeadBlocks.push_back(ClonedBB); 1690 } 1691 1692 // Remove all MemorySSA in the dead blocks 1693 if (MSSAU) { 1694 SmallSetVector<BasicBlock *, 8> DeadBlockSet(DeadBlocks.begin(), 1695 DeadBlocks.end()); 1696 MSSAU->removeBlocks(DeadBlockSet); 1697 } 1698 1699 // Drop any remaining references to break cycles. 1700 for (BasicBlock *BB : DeadBlocks) 1701 BB->dropAllReferences(); 1702 // Erase them from the IR. 1703 for (BasicBlock *BB : DeadBlocks) 1704 BB->eraseFromParent(); 1705 } 1706 1707 static void deleteDeadBlocksFromLoop(Loop &L, 1708 SmallVectorImpl<BasicBlock *> &ExitBlocks, 1709 DominatorTree &DT, LoopInfo &LI, 1710 MemorySSAUpdater *MSSAU, 1711 ScalarEvolution *SE, 1712 LPMUpdater &LoopUpdater) { 1713 // Find all the dead blocks tied to this loop, and remove them from their 1714 // successors. 1715 SmallSetVector<BasicBlock *, 8> DeadBlockSet; 1716 1717 // Start with loop/exit blocks and get a transitive closure of reachable dead 1718 // blocks. 1719 SmallVector<BasicBlock *, 16> DeathCandidates(ExitBlocks.begin(), 1720 ExitBlocks.end()); 1721 DeathCandidates.append(L.blocks().begin(), L.blocks().end()); 1722 while (!DeathCandidates.empty()) { 1723 auto *BB = DeathCandidates.pop_back_val(); 1724 if (!DeadBlockSet.count(BB) && !DT.isReachableFromEntry(BB)) { 1725 for (BasicBlock *SuccBB : successors(BB)) { 1726 SuccBB->removePredecessor(BB); 1727 DeathCandidates.push_back(SuccBB); 1728 } 1729 DeadBlockSet.insert(BB); 1730 } 1731 } 1732 1733 // Remove all MemorySSA in the dead blocks 1734 if (MSSAU) 1735 MSSAU->removeBlocks(DeadBlockSet); 1736 1737 // Filter out the dead blocks from the exit blocks list so that it can be 1738 // used in the caller. 1739 llvm::erase_if(ExitBlocks, 1740 [&](BasicBlock *BB) { return DeadBlockSet.count(BB); }); 1741 1742 // Walk from this loop up through its parents removing all of the dead blocks. 1743 for (Loop *ParentL = &L; ParentL; ParentL = ParentL->getParentLoop()) { 1744 for (auto *BB : DeadBlockSet) 1745 ParentL->getBlocksSet().erase(BB); 1746 llvm::erase_if(ParentL->getBlocksVector(), 1747 [&](BasicBlock *BB) { return DeadBlockSet.count(BB); }); 1748 } 1749 1750 // Now delete the dead child loops. This raw delete will clear them 1751 // recursively. 1752 llvm::erase_if(L.getSubLoopsVector(), [&](Loop *ChildL) { 1753 if (!DeadBlockSet.count(ChildL->getHeader())) 1754 return false; 1755 1756 assert(llvm::all_of(ChildL->blocks(), 1757 [&](BasicBlock *ChildBB) { 1758 return DeadBlockSet.count(ChildBB); 1759 }) && 1760 "If the child loop header is dead all blocks in the child loop must " 1761 "be dead as well!"); 1762 LoopUpdater.markLoopAsDeleted(*ChildL, ChildL->getName()); 1763 if (SE) 1764 SE->forgetBlockAndLoopDispositions(); 1765 LI.destroy(ChildL); 1766 return true; 1767 }); 1768 1769 // Remove the loop mappings for the dead blocks and drop all the references 1770 // from these blocks to others to handle cyclic references as we start 1771 // deleting the blocks themselves. 1772 for (auto *BB : DeadBlockSet) { 1773 // Check that the dominator tree has already been updated. 1774 assert(!DT.getNode(BB) && "Should already have cleared domtree!"); 1775 LI.changeLoopFor(BB, nullptr); 1776 // Drop all uses of the instructions to make sure we won't have dangling 1777 // uses in other blocks. 1778 for (auto &I : *BB) 1779 if (!I.use_empty()) 1780 I.replaceAllUsesWith(PoisonValue::get(I.getType())); 1781 BB->dropAllReferences(); 1782 } 1783 1784 // Actually delete the blocks now that they've been fully unhooked from the 1785 // IR. 1786 for (auto *BB : DeadBlockSet) 1787 BB->eraseFromParent(); 1788 } 1789 1790 /// Recompute the set of blocks in a loop after unswitching. 1791 /// 1792 /// This walks from the original headers predecessors to rebuild the loop. We 1793 /// take advantage of the fact that new blocks can't have been added, and so we 1794 /// filter by the original loop's blocks. This also handles potentially 1795 /// unreachable code that we don't want to explore but might be found examining 1796 /// the predecessors of the header. 1797 /// 1798 /// If the original loop is no longer a loop, this will return an empty set. If 1799 /// it remains a loop, all the blocks within it will be added to the set 1800 /// (including those blocks in inner loops). 1801 static SmallPtrSet<const BasicBlock *, 16> recomputeLoopBlockSet(Loop &L, 1802 LoopInfo &LI) { 1803 SmallPtrSet<const BasicBlock *, 16> LoopBlockSet; 1804 1805 auto *PH = L.getLoopPreheader(); 1806 auto *Header = L.getHeader(); 1807 1808 // A worklist to use while walking backwards from the header. 1809 SmallVector<BasicBlock *, 16> Worklist; 1810 1811 // First walk the predecessors of the header to find the backedges. This will 1812 // form the basis of our walk. 1813 for (auto *Pred : predecessors(Header)) { 1814 // Skip the preheader. 1815 if (Pred == PH) 1816 continue; 1817 1818 // Because the loop was in simplified form, the only non-loop predecessor 1819 // is the preheader. 1820 assert(L.contains(Pred) && "Found a predecessor of the loop header other " 1821 "than the preheader that is not part of the " 1822 "loop!"); 1823 1824 // Insert this block into the loop set and on the first visit and, if it 1825 // isn't the header we're currently walking, put it into the worklist to 1826 // recurse through. 1827 if (LoopBlockSet.insert(Pred).second && Pred != Header) 1828 Worklist.push_back(Pred); 1829 } 1830 1831 // If no backedges were found, we're done. 1832 if (LoopBlockSet.empty()) 1833 return LoopBlockSet; 1834 1835 // We found backedges, recurse through them to identify the loop blocks. 1836 while (!Worklist.empty()) { 1837 BasicBlock *BB = Worklist.pop_back_val(); 1838 assert(LoopBlockSet.count(BB) && "Didn't put block into the loop set!"); 1839 1840 // No need to walk past the header. 1841 if (BB == Header) 1842 continue; 1843 1844 // Because we know the inner loop structure remains valid we can use the 1845 // loop structure to jump immediately across the entire nested loop. 1846 // Further, because it is in loop simplified form, we can directly jump 1847 // to its preheader afterward. 1848 if (Loop *InnerL = LI.getLoopFor(BB)) 1849 if (InnerL != &L) { 1850 assert(L.contains(InnerL) && 1851 "Should not reach a loop *outside* this loop!"); 1852 // The preheader is the only possible predecessor of the loop so 1853 // insert it into the set and check whether it was already handled. 1854 auto *InnerPH = InnerL->getLoopPreheader(); 1855 assert(L.contains(InnerPH) && "Cannot contain an inner loop block " 1856 "but not contain the inner loop " 1857 "preheader!"); 1858 if (!LoopBlockSet.insert(InnerPH).second) 1859 // The only way to reach the preheader is through the loop body 1860 // itself so if it has been visited the loop is already handled. 1861 continue; 1862 1863 // Insert all of the blocks (other than those already present) into 1864 // the loop set. We expect at least the block that led us to find the 1865 // inner loop to be in the block set, but we may also have other loop 1866 // blocks if they were already enqueued as predecessors of some other 1867 // outer loop block. 1868 for (auto *InnerBB : InnerL->blocks()) { 1869 if (InnerBB == BB) { 1870 assert(LoopBlockSet.count(InnerBB) && 1871 "Block should already be in the set!"); 1872 continue; 1873 } 1874 1875 LoopBlockSet.insert(InnerBB); 1876 } 1877 1878 // Add the preheader to the worklist so we will continue past the 1879 // loop body. 1880 Worklist.push_back(InnerPH); 1881 continue; 1882 } 1883 1884 // Insert any predecessors that were in the original loop into the new 1885 // set, and if the insert is successful, add them to the worklist. 1886 for (auto *Pred : predecessors(BB)) 1887 if (L.contains(Pred) && LoopBlockSet.insert(Pred).second) 1888 Worklist.push_back(Pred); 1889 } 1890 1891 assert(LoopBlockSet.count(Header) && "Cannot fail to add the header!"); 1892 1893 // We've found all the blocks participating in the loop, return our completed 1894 // set. 1895 return LoopBlockSet; 1896 } 1897 1898 /// Rebuild a loop after unswitching removes some subset of blocks and edges. 1899 /// 1900 /// The removal may have removed some child loops entirely but cannot have 1901 /// disturbed any remaining child loops. However, they may need to be hoisted 1902 /// to the parent loop (or to be top-level loops). The original loop may be 1903 /// completely removed. 1904 /// 1905 /// The sibling loops resulting from this update are returned. If the original 1906 /// loop remains a valid loop, it will be the first entry in this list with all 1907 /// of the newly sibling loops following it. 1908 /// 1909 /// Returns true if the loop remains a loop after unswitching, and false if it 1910 /// is no longer a loop after unswitching (and should not continue to be 1911 /// referenced). 1912 static bool rebuildLoopAfterUnswitch(Loop &L, ArrayRef<BasicBlock *> ExitBlocks, 1913 LoopInfo &LI, 1914 SmallVectorImpl<Loop *> &HoistedLoops, 1915 ScalarEvolution *SE) { 1916 auto *PH = L.getLoopPreheader(); 1917 1918 // Compute the actual parent loop from the exit blocks. Because we may have 1919 // pruned some exits the loop may be different from the original parent. 1920 Loop *ParentL = nullptr; 1921 SmallVector<Loop *, 4> ExitLoops; 1922 SmallVector<BasicBlock *, 4> ExitsInLoops; 1923 ExitsInLoops.reserve(ExitBlocks.size()); 1924 for (auto *ExitBB : ExitBlocks) 1925 if (Loop *ExitL = LI.getLoopFor(ExitBB)) { 1926 ExitLoops.push_back(ExitL); 1927 ExitsInLoops.push_back(ExitBB); 1928 if (!ParentL || (ParentL != ExitL && ParentL->contains(ExitL))) 1929 ParentL = ExitL; 1930 } 1931 1932 // Recompute the blocks participating in this loop. This may be empty if it 1933 // is no longer a loop. 1934 auto LoopBlockSet = recomputeLoopBlockSet(L, LI); 1935 1936 // If we still have a loop, we need to re-set the loop's parent as the exit 1937 // block set changing may have moved it within the loop nest. Note that this 1938 // can only happen when this loop has a parent as it can only hoist the loop 1939 // *up* the nest. 1940 if (!LoopBlockSet.empty() && L.getParentLoop() != ParentL) { 1941 // Remove this loop's (original) blocks from all of the intervening loops. 1942 for (Loop *IL = L.getParentLoop(); IL != ParentL; 1943 IL = IL->getParentLoop()) { 1944 IL->getBlocksSet().erase(PH); 1945 for (auto *BB : L.blocks()) 1946 IL->getBlocksSet().erase(BB); 1947 llvm::erase_if(IL->getBlocksVector(), [&](BasicBlock *BB) { 1948 return BB == PH || L.contains(BB); 1949 }); 1950 } 1951 1952 LI.changeLoopFor(PH, ParentL); 1953 L.getParentLoop()->removeChildLoop(&L); 1954 if (ParentL) 1955 ParentL->addChildLoop(&L); 1956 else 1957 LI.addTopLevelLoop(&L); 1958 } 1959 1960 // Now we update all the blocks which are no longer within the loop. 1961 auto &Blocks = L.getBlocksVector(); 1962 auto BlocksSplitI = 1963 LoopBlockSet.empty() 1964 ? Blocks.begin() 1965 : std::stable_partition( 1966 Blocks.begin(), Blocks.end(), 1967 [&](BasicBlock *BB) { return LoopBlockSet.count(BB); }); 1968 1969 // Before we erase the list of unlooped blocks, build a set of them. 1970 SmallPtrSet<BasicBlock *, 16> UnloopedBlocks(BlocksSplitI, Blocks.end()); 1971 if (LoopBlockSet.empty()) 1972 UnloopedBlocks.insert(PH); 1973 1974 // Now erase these blocks from the loop. 1975 for (auto *BB : make_range(BlocksSplitI, Blocks.end())) 1976 L.getBlocksSet().erase(BB); 1977 Blocks.erase(BlocksSplitI, Blocks.end()); 1978 1979 // Sort the exits in ascending loop depth, we'll work backwards across these 1980 // to process them inside out. 1981 llvm::stable_sort(ExitsInLoops, [&](BasicBlock *LHS, BasicBlock *RHS) { 1982 return LI.getLoopDepth(LHS) < LI.getLoopDepth(RHS); 1983 }); 1984 1985 // We'll build up a set for each exit loop. 1986 SmallPtrSet<BasicBlock *, 16> NewExitLoopBlocks; 1987 Loop *PrevExitL = L.getParentLoop(); // The deepest possible exit loop. 1988 1989 auto RemoveUnloopedBlocksFromLoop = 1990 [](Loop &L, SmallPtrSetImpl<BasicBlock *> &UnloopedBlocks) { 1991 for (auto *BB : UnloopedBlocks) 1992 L.getBlocksSet().erase(BB); 1993 llvm::erase_if(L.getBlocksVector(), [&](BasicBlock *BB) { 1994 return UnloopedBlocks.count(BB); 1995 }); 1996 }; 1997 1998 SmallVector<BasicBlock *, 16> Worklist; 1999 while (!UnloopedBlocks.empty() && !ExitsInLoops.empty()) { 2000 assert(Worklist.empty() && "Didn't clear worklist!"); 2001 assert(NewExitLoopBlocks.empty() && "Didn't clear loop set!"); 2002 2003 // Grab the next exit block, in decreasing loop depth order. 2004 BasicBlock *ExitBB = ExitsInLoops.pop_back_val(); 2005 Loop &ExitL = *LI.getLoopFor(ExitBB); 2006 assert(ExitL.contains(&L) && "Exit loop must contain the inner loop!"); 2007 2008 // Erase all of the unlooped blocks from the loops between the previous 2009 // exit loop and this exit loop. This works because the ExitInLoops list is 2010 // sorted in increasing order of loop depth and thus we visit loops in 2011 // decreasing order of loop depth. 2012 for (; PrevExitL != &ExitL; PrevExitL = PrevExitL->getParentLoop()) 2013 RemoveUnloopedBlocksFromLoop(*PrevExitL, UnloopedBlocks); 2014 2015 // Walk the CFG back until we hit the cloned PH adding everything reachable 2016 // and in the unlooped set to this exit block's loop. 2017 Worklist.push_back(ExitBB); 2018 do { 2019 BasicBlock *BB = Worklist.pop_back_val(); 2020 // We can stop recursing at the cloned preheader (if we get there). 2021 if (BB == PH) 2022 continue; 2023 2024 for (BasicBlock *PredBB : predecessors(BB)) { 2025 // If this pred has already been moved to our set or is part of some 2026 // (inner) loop, no update needed. 2027 if (!UnloopedBlocks.erase(PredBB)) { 2028 assert((NewExitLoopBlocks.count(PredBB) || 2029 ExitL.contains(LI.getLoopFor(PredBB))) && 2030 "Predecessor not in a nested loop (or already visited)!"); 2031 continue; 2032 } 2033 2034 // We just insert into the loop set here. We'll add these blocks to the 2035 // exit loop after we build up the set in a deterministic order rather 2036 // than the predecessor-influenced visit order. 2037 bool Inserted = NewExitLoopBlocks.insert(PredBB).second; 2038 (void)Inserted; 2039 assert(Inserted && "Should only visit an unlooped block once!"); 2040 2041 // And recurse through to its predecessors. 2042 Worklist.push_back(PredBB); 2043 } 2044 } while (!Worklist.empty()); 2045 2046 // If blocks in this exit loop were directly part of the original loop (as 2047 // opposed to a child loop) update the map to point to this exit loop. This 2048 // just updates a map and so the fact that the order is unstable is fine. 2049 for (auto *BB : NewExitLoopBlocks) 2050 if (Loop *BBL = LI.getLoopFor(BB)) 2051 if (BBL == &L || !L.contains(BBL)) 2052 LI.changeLoopFor(BB, &ExitL); 2053 2054 // We will remove the remaining unlooped blocks from this loop in the next 2055 // iteration or below. 2056 NewExitLoopBlocks.clear(); 2057 } 2058 2059 // Any remaining unlooped blocks are no longer part of any loop unless they 2060 // are part of some child loop. 2061 for (; PrevExitL; PrevExitL = PrevExitL->getParentLoop()) 2062 RemoveUnloopedBlocksFromLoop(*PrevExitL, UnloopedBlocks); 2063 for (auto *BB : UnloopedBlocks) 2064 if (Loop *BBL = LI.getLoopFor(BB)) 2065 if (BBL == &L || !L.contains(BBL)) 2066 LI.changeLoopFor(BB, nullptr); 2067 2068 // Sink all the child loops whose headers are no longer in the loop set to 2069 // the parent (or to be top level loops). We reach into the loop and directly 2070 // update its subloop vector to make this batch update efficient. 2071 auto &SubLoops = L.getSubLoopsVector(); 2072 auto SubLoopsSplitI = 2073 LoopBlockSet.empty() 2074 ? SubLoops.begin() 2075 : std::stable_partition( 2076 SubLoops.begin(), SubLoops.end(), [&](Loop *SubL) { 2077 return LoopBlockSet.count(SubL->getHeader()); 2078 }); 2079 for (auto *HoistedL : make_range(SubLoopsSplitI, SubLoops.end())) { 2080 HoistedLoops.push_back(HoistedL); 2081 HoistedL->setParentLoop(nullptr); 2082 2083 // To compute the new parent of this hoisted loop we look at where we 2084 // placed the preheader above. We can't lookup the header itself because we 2085 // retained the mapping from the header to the hoisted loop. But the 2086 // preheader and header should have the exact same new parent computed 2087 // based on the set of exit blocks from the original loop as the preheader 2088 // is a predecessor of the header and so reached in the reverse walk. And 2089 // because the loops were all in simplified form the preheader of the 2090 // hoisted loop can't be part of some *other* loop. 2091 if (auto *NewParentL = LI.getLoopFor(HoistedL->getLoopPreheader())) 2092 NewParentL->addChildLoop(HoistedL); 2093 else 2094 LI.addTopLevelLoop(HoistedL); 2095 } 2096 SubLoops.erase(SubLoopsSplitI, SubLoops.end()); 2097 2098 // Actually delete the loop if nothing remained within it. 2099 if (Blocks.empty()) { 2100 assert(SubLoops.empty() && 2101 "Failed to remove all subloops from the original loop!"); 2102 if (Loop *ParentL = L.getParentLoop()) 2103 ParentL->removeChildLoop(llvm::find(*ParentL, &L)); 2104 else 2105 LI.removeLoop(llvm::find(LI, &L)); 2106 // markLoopAsDeleted for L should be triggered by the caller (it is 2107 // typically done within postUnswitch). 2108 if (SE) 2109 SE->forgetBlockAndLoopDispositions(); 2110 LI.destroy(&L); 2111 return false; 2112 } 2113 2114 return true; 2115 } 2116 2117 /// Helper to visit a dominator subtree, invoking a callable on each node. 2118 /// 2119 /// Returning false at any point will stop walking past that node of the tree. 2120 template <typename CallableT> 2121 void visitDomSubTree(DominatorTree &DT, BasicBlock *BB, CallableT Callable) { 2122 SmallVector<DomTreeNode *, 4> DomWorklist; 2123 DomWorklist.push_back(DT[BB]); 2124 #ifndef NDEBUG 2125 SmallPtrSet<DomTreeNode *, 4> Visited; 2126 Visited.insert(DT[BB]); 2127 #endif 2128 do { 2129 DomTreeNode *N = DomWorklist.pop_back_val(); 2130 2131 // Visit this node. 2132 if (!Callable(N->getBlock())) 2133 continue; 2134 2135 // Accumulate the child nodes. 2136 for (DomTreeNode *ChildN : *N) { 2137 assert(Visited.insert(ChildN).second && 2138 "Cannot visit a node twice when walking a tree!"); 2139 DomWorklist.push_back(ChildN); 2140 } 2141 } while (!DomWorklist.empty()); 2142 } 2143 2144 void postUnswitch(Loop &L, LPMUpdater &U, StringRef LoopName, 2145 bool CurrentLoopValid, bool PartiallyInvariant, 2146 bool InjectedCondition, ArrayRef<Loop *> NewLoops) { 2147 // If we did a non-trivial unswitch, we have added new (cloned) loops. 2148 if (!NewLoops.empty()) 2149 U.addSiblingLoops(NewLoops); 2150 2151 // If the current loop remains valid, we should revisit it to catch any 2152 // other unswitch opportunities. Otherwise, we need to mark it as deleted. 2153 if (CurrentLoopValid) { 2154 if (PartiallyInvariant) { 2155 // Mark the new loop as partially unswitched, to avoid unswitching on 2156 // the same condition again. 2157 auto &Context = L.getHeader()->getContext(); 2158 MDNode *DisableUnswitchMD = MDNode::get( 2159 Context, 2160 MDString::get(Context, "llvm.loop.unswitch.partial.disable")); 2161 MDNode *NewLoopID = makePostTransformationMetadata( 2162 Context, L.getLoopID(), {"llvm.loop.unswitch.partial"}, 2163 {DisableUnswitchMD}); 2164 L.setLoopID(NewLoopID); 2165 } else if (InjectedCondition) { 2166 // Do the same for injection of invariant conditions. 2167 auto &Context = L.getHeader()->getContext(); 2168 MDNode *DisableUnswitchMD = MDNode::get( 2169 Context, 2170 MDString::get(Context, "llvm.loop.unswitch.injection.disable")); 2171 MDNode *NewLoopID = makePostTransformationMetadata( 2172 Context, L.getLoopID(), {"llvm.loop.unswitch.injection"}, 2173 {DisableUnswitchMD}); 2174 L.setLoopID(NewLoopID); 2175 } else 2176 U.revisitCurrentLoop(); 2177 } else 2178 U.markLoopAsDeleted(L, LoopName); 2179 } 2180 2181 static void unswitchNontrivialInvariants( 2182 Loop &L, Instruction &TI, ArrayRef<Value *> Invariants, 2183 IVConditionInfo &PartialIVInfo, DominatorTree &DT, LoopInfo &LI, 2184 AssumptionCache &AC, ScalarEvolution *SE, MemorySSAUpdater *MSSAU, 2185 LPMUpdater &LoopUpdater, bool InsertFreeze, bool InjectedCondition) { 2186 auto *ParentBB = TI.getParent(); 2187 BranchInst *BI = dyn_cast<BranchInst>(&TI); 2188 SwitchInst *SI = BI ? nullptr : cast<SwitchInst>(&TI); 2189 2190 // Save the current loop name in a variable so that we can report it even 2191 // after it has been deleted. 2192 std::string LoopName(L.getName()); 2193 2194 // We can only unswitch switches, conditional branches with an invariant 2195 // condition, or combining invariant conditions with an instruction or 2196 // partially invariant instructions. 2197 assert((SI || (BI && BI->isConditional())) && 2198 "Can only unswitch switches and conditional branch!"); 2199 bool PartiallyInvariant = !PartialIVInfo.InstToDuplicate.empty(); 2200 bool FullUnswitch = 2201 SI || (skipTrivialSelect(BI->getCondition()) == Invariants[0] && 2202 !PartiallyInvariant); 2203 if (FullUnswitch) 2204 assert(Invariants.size() == 1 && 2205 "Cannot have other invariants with full unswitching!"); 2206 else 2207 assert(isa<Instruction>(skipTrivialSelect(BI->getCondition())) && 2208 "Partial unswitching requires an instruction as the condition!"); 2209 2210 if (MSSAU && VerifyMemorySSA) 2211 MSSAU->getMemorySSA()->verifyMemorySSA(); 2212 2213 // Constant and BBs tracking the cloned and continuing successor. When we are 2214 // unswitching the entire condition, this can just be trivially chosen to 2215 // unswitch towards `true`. However, when we are unswitching a set of 2216 // invariants combined with `and` or `or` or partially invariant instructions, 2217 // the combining operation determines the best direction to unswitch: we want 2218 // to unswitch the direction that will collapse the branch. 2219 bool Direction = true; 2220 int ClonedSucc = 0; 2221 if (!FullUnswitch) { 2222 Value *Cond = skipTrivialSelect(BI->getCondition()); 2223 (void)Cond; 2224 assert(((match(Cond, m_LogicalAnd()) ^ match(Cond, m_LogicalOr())) || 2225 PartiallyInvariant) && 2226 "Only `or`, `and`, an `select`, partially invariant instructions " 2227 "can combine invariants being unswitched."); 2228 if (!match(Cond, m_LogicalOr())) { 2229 if (match(Cond, m_LogicalAnd()) || 2230 (PartiallyInvariant && !PartialIVInfo.KnownValue->isOneValue())) { 2231 Direction = false; 2232 ClonedSucc = 1; 2233 } 2234 } 2235 } 2236 2237 BasicBlock *RetainedSuccBB = 2238 BI ? BI->getSuccessor(1 - ClonedSucc) : SI->getDefaultDest(); 2239 SmallSetVector<BasicBlock *, 4> UnswitchedSuccBBs; 2240 if (BI) 2241 UnswitchedSuccBBs.insert(BI->getSuccessor(ClonedSucc)); 2242 else 2243 for (auto Case : SI->cases()) 2244 if (Case.getCaseSuccessor() != RetainedSuccBB) 2245 UnswitchedSuccBBs.insert(Case.getCaseSuccessor()); 2246 2247 assert(!UnswitchedSuccBBs.count(RetainedSuccBB) && 2248 "Should not unswitch the same successor we are retaining!"); 2249 2250 // The branch should be in this exact loop. Any inner loop's invariant branch 2251 // should be handled by unswitching that inner loop. The caller of this 2252 // routine should filter out any candidates that remain (but were skipped for 2253 // whatever reason). 2254 assert(LI.getLoopFor(ParentBB) == &L && "Branch in an inner loop!"); 2255 2256 // Compute the parent loop now before we start hacking on things. 2257 Loop *ParentL = L.getParentLoop(); 2258 // Get blocks in RPO order for MSSA update, before changing the CFG. 2259 LoopBlocksRPO LBRPO(&L); 2260 if (MSSAU) 2261 LBRPO.perform(&LI); 2262 2263 // Compute the outer-most loop containing one of our exit blocks. This is the 2264 // furthest up our loopnest which can be mutated, which we will use below to 2265 // update things. 2266 Loop *OuterExitL = &L; 2267 SmallVector<BasicBlock *, 4> ExitBlocks; 2268 L.getUniqueExitBlocks(ExitBlocks); 2269 for (auto *ExitBB : ExitBlocks) { 2270 // ExitBB can be an exit block for several levels in the loop nest. Make 2271 // sure we find the top most. 2272 Loop *NewOuterExitL = getTopMostExitingLoop(ExitBB, LI); 2273 if (!NewOuterExitL) { 2274 // We exited the entire nest with this block, so we're done. 2275 OuterExitL = nullptr; 2276 break; 2277 } 2278 if (NewOuterExitL != OuterExitL && NewOuterExitL->contains(OuterExitL)) 2279 OuterExitL = NewOuterExitL; 2280 } 2281 2282 // At this point, we're definitely going to unswitch something so invalidate 2283 // any cached information in ScalarEvolution for the outer most loop 2284 // containing an exit block and all nested loops. 2285 if (SE) { 2286 if (OuterExitL) 2287 SE->forgetLoop(OuterExitL); 2288 else 2289 SE->forgetTopmostLoop(&L); 2290 SE->forgetBlockAndLoopDispositions(); 2291 } 2292 2293 // If the edge from this terminator to a successor dominates that successor, 2294 // store a map from each block in its dominator subtree to it. This lets us 2295 // tell when cloning for a particular successor if a block is dominated by 2296 // some *other* successor with a single data structure. We use this to 2297 // significantly reduce cloning. 2298 SmallDenseMap<BasicBlock *, BasicBlock *, 16> DominatingSucc; 2299 for (auto *SuccBB : llvm::concat<BasicBlock *const>(ArrayRef(RetainedSuccBB), 2300 UnswitchedSuccBBs)) 2301 if (SuccBB->getUniquePredecessor() || 2302 llvm::all_of(predecessors(SuccBB), [&](BasicBlock *PredBB) { 2303 return PredBB == ParentBB || DT.dominates(SuccBB, PredBB); 2304 })) 2305 visitDomSubTree(DT, SuccBB, [&](BasicBlock *BB) { 2306 DominatingSucc[BB] = SuccBB; 2307 return true; 2308 }); 2309 2310 // Split the preheader, so that we know that there is a safe place to insert 2311 // the conditional branch. We will change the preheader to have a conditional 2312 // branch on LoopCond. The original preheader will become the split point 2313 // between the unswitched versions, and we will have a new preheader for the 2314 // original loop. 2315 BasicBlock *SplitBB = L.getLoopPreheader(); 2316 BasicBlock *LoopPH = SplitEdge(SplitBB, L.getHeader(), &DT, &LI, MSSAU); 2317 2318 // Keep track of the dominator tree updates needed. 2319 SmallVector<DominatorTree::UpdateType, 4> DTUpdates; 2320 2321 // Clone the loop for each unswitched successor. 2322 SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps; 2323 VMaps.reserve(UnswitchedSuccBBs.size()); 2324 SmallDenseMap<BasicBlock *, BasicBlock *, 4> ClonedPHs; 2325 for (auto *SuccBB : UnswitchedSuccBBs) { 2326 VMaps.emplace_back(new ValueToValueMapTy()); 2327 ClonedPHs[SuccBB] = buildClonedLoopBlocks( 2328 L, LoopPH, SplitBB, ExitBlocks, ParentBB, SuccBB, RetainedSuccBB, 2329 DominatingSucc, *VMaps.back(), DTUpdates, AC, DT, LI, MSSAU, SE); 2330 } 2331 2332 // Drop metadata if we may break its semantics by moving this instr into the 2333 // split block. 2334 if (TI.getMetadata(LLVMContext::MD_make_implicit)) { 2335 if (DropNonTrivialImplicitNullChecks) 2336 // Do not spend time trying to understand if we can keep it, just drop it 2337 // to save compile time. 2338 TI.setMetadata(LLVMContext::MD_make_implicit, nullptr); 2339 else { 2340 // It is only legal to preserve make.implicit metadata if we are 2341 // guaranteed no reach implicit null check after following this branch. 2342 ICFLoopSafetyInfo SafetyInfo; 2343 SafetyInfo.computeLoopSafetyInfo(&L); 2344 if (!SafetyInfo.isGuaranteedToExecute(TI, &DT, &L)) 2345 TI.setMetadata(LLVMContext::MD_make_implicit, nullptr); 2346 } 2347 } 2348 2349 // The stitching of the branched code back together depends on whether we're 2350 // doing full unswitching or not with the exception that we always want to 2351 // nuke the initial terminator placed in the split block. 2352 SplitBB->getTerminator()->eraseFromParent(); 2353 if (FullUnswitch) { 2354 // Keep a clone of the terminator for MSSA updates. 2355 Instruction *NewTI = TI.clone(); 2356 NewTI->insertInto(ParentBB, ParentBB->end()); 2357 2358 // Splice the terminator from the original loop and rewrite its 2359 // successors. 2360 TI.moveBefore(*SplitBB, SplitBB->end()); 2361 TI.dropLocation(); 2362 2363 // First wire up the moved terminator to the preheaders. 2364 if (BI) { 2365 BasicBlock *ClonedPH = ClonedPHs.begin()->second; 2366 BI->setSuccessor(ClonedSucc, ClonedPH); 2367 BI->setSuccessor(1 - ClonedSucc, LoopPH); 2368 Value *Cond = skipTrivialSelect(BI->getCondition()); 2369 if (InsertFreeze) { 2370 // We don't give any debug location to the new freeze, because the 2371 // BI (`dyn_cast<BranchInst>(TI)`) is an in-loop instruction hoisted 2372 // out of the loop. 2373 Cond = new FreezeInst(Cond, Cond->getName() + ".fr", BI->getIterator()); 2374 cast<Instruction>(Cond)->setDebugLoc(DebugLoc::getDropped()); 2375 } 2376 BI->setCondition(Cond); 2377 DTUpdates.push_back({DominatorTree::Insert, SplitBB, ClonedPH}); 2378 } else { 2379 assert(SI && "Must either be a branch or switch!"); 2380 2381 // Walk the cases and directly update their successors. 2382 assert(SI->getDefaultDest() == RetainedSuccBB && 2383 "Not retaining default successor!"); 2384 SI->setDefaultDest(LoopPH); 2385 for (const auto &Case : SI->cases()) 2386 if (Case.getCaseSuccessor() == RetainedSuccBB) 2387 Case.setSuccessor(LoopPH); 2388 else 2389 Case.setSuccessor(ClonedPHs.find(Case.getCaseSuccessor())->second); 2390 2391 if (InsertFreeze) 2392 SI->setCondition(new FreezeInst(SI->getCondition(), 2393 SI->getCondition()->getName() + ".fr", 2394 SI->getIterator())); 2395 2396 // We need to use the set to populate domtree updates as even when there 2397 // are multiple cases pointing at the same successor we only want to 2398 // remove and insert one edge in the domtree. 2399 for (BasicBlock *SuccBB : UnswitchedSuccBBs) 2400 DTUpdates.push_back( 2401 {DominatorTree::Insert, SplitBB, ClonedPHs.find(SuccBB)->second}); 2402 } 2403 2404 if (MSSAU) { 2405 DT.applyUpdates(DTUpdates); 2406 DTUpdates.clear(); 2407 2408 // Remove all but one edge to the retained block and all unswitched 2409 // blocks. This is to avoid having duplicate entries in the cloned Phis, 2410 // when we know we only keep a single edge for each case. 2411 MSSAU->removeDuplicatePhiEdgesBetween(ParentBB, RetainedSuccBB); 2412 for (BasicBlock *SuccBB : UnswitchedSuccBBs) 2413 MSSAU->removeDuplicatePhiEdgesBetween(ParentBB, SuccBB); 2414 2415 for (auto &VMap : VMaps) 2416 MSSAU->updateForClonedLoop(LBRPO, ExitBlocks, *VMap, 2417 /*IgnoreIncomingWithNoClones=*/true); 2418 MSSAU->updateExitBlocksForClonedLoop(ExitBlocks, VMaps, DT); 2419 2420 // Remove all edges to unswitched blocks. 2421 for (BasicBlock *SuccBB : UnswitchedSuccBBs) 2422 MSSAU->removeEdge(ParentBB, SuccBB); 2423 } 2424 2425 // Now unhook the successor relationship as we'll be replacing 2426 // the terminator with a direct branch. This is much simpler for branches 2427 // than switches so we handle those first. 2428 if (BI) { 2429 // Remove the parent as a predecessor of the unswitched successor. 2430 assert(UnswitchedSuccBBs.size() == 1 && 2431 "Only one possible unswitched block for a branch!"); 2432 BasicBlock *UnswitchedSuccBB = *UnswitchedSuccBBs.begin(); 2433 UnswitchedSuccBB->removePredecessor(ParentBB, 2434 /*KeepOneInputPHIs*/ true); 2435 DTUpdates.push_back({DominatorTree::Delete, ParentBB, UnswitchedSuccBB}); 2436 } else { 2437 // Note that we actually want to remove the parent block as a predecessor 2438 // of *every* case successor. The case successor is either unswitched, 2439 // completely eliminating an edge from the parent to that successor, or it 2440 // is a duplicate edge to the retained successor as the retained successor 2441 // is always the default successor and as we'll replace this with a direct 2442 // branch we no longer need the duplicate entries in the PHI nodes. 2443 SwitchInst *NewSI = cast<SwitchInst>(NewTI); 2444 assert(NewSI->getDefaultDest() == RetainedSuccBB && 2445 "Not retaining default successor!"); 2446 for (const auto &Case : NewSI->cases()) 2447 Case.getCaseSuccessor()->removePredecessor( 2448 ParentBB, 2449 /*KeepOneInputPHIs*/ true); 2450 2451 // We need to use the set to populate domtree updates as even when there 2452 // are multiple cases pointing at the same successor we only want to 2453 // remove and insert one edge in the domtree. 2454 for (BasicBlock *SuccBB : UnswitchedSuccBBs) 2455 DTUpdates.push_back({DominatorTree::Delete, ParentBB, SuccBB}); 2456 } 2457 2458 // Create a new unconditional branch to the continuing block (as opposed to 2459 // the one cloned). 2460 Instruction *NewBI = BranchInst::Create(RetainedSuccBB, ParentBB); 2461 NewBI->setDebugLoc(NewTI->getDebugLoc()); 2462 2463 // After MSSAU update, remove the cloned terminator instruction NewTI. 2464 NewTI->eraseFromParent(); 2465 } else { 2466 assert(BI && "Only branches have partial unswitching."); 2467 assert(UnswitchedSuccBBs.size() == 1 && 2468 "Only one possible unswitched block for a branch!"); 2469 BasicBlock *ClonedPH = ClonedPHs.begin()->second; 2470 // When doing a partial unswitch, we have to do a bit more work to build up 2471 // the branch in the split block. 2472 if (PartiallyInvariant) 2473 buildPartialInvariantUnswitchConditionalBranch( 2474 *SplitBB, Invariants, Direction, *ClonedPH, *LoopPH, L, MSSAU); 2475 else { 2476 buildPartialUnswitchConditionalBranch( 2477 *SplitBB, Invariants, Direction, *ClonedPH, *LoopPH, 2478 FreezeLoopUnswitchCond, BI, &AC, DT); 2479 } 2480 DTUpdates.push_back({DominatorTree::Insert, SplitBB, ClonedPH}); 2481 2482 if (MSSAU) { 2483 DT.applyUpdates(DTUpdates); 2484 DTUpdates.clear(); 2485 2486 // Perform MSSA cloning updates. 2487 for (auto &VMap : VMaps) 2488 MSSAU->updateForClonedLoop(LBRPO, ExitBlocks, *VMap, 2489 /*IgnoreIncomingWithNoClones=*/true); 2490 MSSAU->updateExitBlocksForClonedLoop(ExitBlocks, VMaps, DT); 2491 } 2492 } 2493 2494 // Apply the updates accumulated above to get an up-to-date dominator tree. 2495 DT.applyUpdates(DTUpdates); 2496 2497 // Now that we have an accurate dominator tree, first delete the dead cloned 2498 // blocks so that we can accurately build any cloned loops. It is important to 2499 // not delete the blocks from the original loop yet because we still want to 2500 // reference the original loop to understand the cloned loop's structure. 2501 deleteDeadClonedBlocks(L, ExitBlocks, VMaps, DT, MSSAU); 2502 2503 // Build the cloned loop structure itself. This may be substantially 2504 // different from the original structure due to the simplified CFG. This also 2505 // handles inserting all the cloned blocks into the correct loops. 2506 SmallVector<Loop *, 4> NonChildClonedLoops; 2507 for (std::unique_ptr<ValueToValueMapTy> &VMap : VMaps) 2508 buildClonedLoops(L, ExitBlocks, *VMap, LI, NonChildClonedLoops); 2509 2510 // Now that our cloned loops have been built, we can update the original loop. 2511 // First we delete the dead blocks from it and then we rebuild the loop 2512 // structure taking these deletions into account. 2513 deleteDeadBlocksFromLoop(L, ExitBlocks, DT, LI, MSSAU, SE, LoopUpdater); 2514 2515 if (MSSAU && VerifyMemorySSA) 2516 MSSAU->getMemorySSA()->verifyMemorySSA(); 2517 2518 SmallVector<Loop *, 4> HoistedLoops; 2519 bool IsStillLoop = 2520 rebuildLoopAfterUnswitch(L, ExitBlocks, LI, HoistedLoops, SE); 2521 2522 if (MSSAU && VerifyMemorySSA) 2523 MSSAU->getMemorySSA()->verifyMemorySSA(); 2524 2525 // This transformation has a high risk of corrupting the dominator tree, and 2526 // the below steps to rebuild loop structures will result in hard to debug 2527 // errors in that case so verify that the dominator tree is sane first. 2528 // FIXME: Remove this when the bugs stop showing up and rely on existing 2529 // verification steps. 2530 assert(DT.verify(DominatorTree::VerificationLevel::Fast)); 2531 2532 if (BI && !PartiallyInvariant) { 2533 // If we unswitched a branch which collapses the condition to a known 2534 // constant we want to replace all the uses of the invariants within both 2535 // the original and cloned blocks. We do this here so that we can use the 2536 // now updated dominator tree to identify which side the users are on. 2537 assert(UnswitchedSuccBBs.size() == 1 && 2538 "Only one possible unswitched block for a branch!"); 2539 BasicBlock *ClonedPH = ClonedPHs.begin()->second; 2540 2541 // When considering multiple partially-unswitched invariants 2542 // we cant just go replace them with constants in both branches. 2543 // 2544 // For 'AND' we infer that true branch ("continue") means true 2545 // for each invariant operand. 2546 // For 'OR' we can infer that false branch ("continue") means false 2547 // for each invariant operand. 2548 // So it happens that for multiple-partial case we dont replace 2549 // in the unswitched branch. 2550 bool ReplaceUnswitched = 2551 FullUnswitch || (Invariants.size() == 1) || PartiallyInvariant; 2552 2553 ConstantInt *UnswitchedReplacement = 2554 Direction ? ConstantInt::getTrue(BI->getContext()) 2555 : ConstantInt::getFalse(BI->getContext()); 2556 ConstantInt *ContinueReplacement = 2557 Direction ? ConstantInt::getFalse(BI->getContext()) 2558 : ConstantInt::getTrue(BI->getContext()); 2559 for (Value *Invariant : Invariants) { 2560 assert(!isa<Constant>(Invariant) && 2561 "Should not be replacing constant values!"); 2562 // Use make_early_inc_range here as set invalidates the iterator. 2563 for (Use &U : llvm::make_early_inc_range(Invariant->uses())) { 2564 Instruction *UserI = dyn_cast<Instruction>(U.getUser()); 2565 if (!UserI) 2566 continue; 2567 2568 // Replace it with the 'continue' side if in the main loop body, and the 2569 // unswitched if in the cloned blocks. 2570 if (DT.dominates(LoopPH, UserI->getParent())) 2571 U.set(ContinueReplacement); 2572 else if (ReplaceUnswitched && 2573 DT.dominates(ClonedPH, UserI->getParent())) 2574 U.set(UnswitchedReplacement); 2575 } 2576 } 2577 } 2578 2579 // We can change which blocks are exit blocks of all the cloned sibling 2580 // loops, the current loop, and any parent loops which shared exit blocks 2581 // with the current loop. As a consequence, we need to re-form LCSSA for 2582 // them. But we shouldn't need to re-form LCSSA for any child loops. 2583 // FIXME: This could be made more efficient by tracking which exit blocks are 2584 // new, and focusing on them, but that isn't likely to be necessary. 2585 // 2586 // In order to reasonably rebuild LCSSA we need to walk inside-out across the 2587 // loop nest and update every loop that could have had its exits changed. We 2588 // also need to cover any intervening loops. We add all of these loops to 2589 // a list and sort them by loop depth to achieve this without updating 2590 // unnecessary loops. 2591 auto UpdateLoop = [&](Loop &UpdateL) { 2592 #ifndef NDEBUG 2593 UpdateL.verifyLoop(); 2594 for (Loop *ChildL : UpdateL) { 2595 ChildL->verifyLoop(); 2596 assert(ChildL->isRecursivelyLCSSAForm(DT, LI) && 2597 "Perturbed a child loop's LCSSA form!"); 2598 } 2599 #endif 2600 // First build LCSSA for this loop so that we can preserve it when 2601 // forming dedicated exits. We don't want to perturb some other loop's 2602 // LCSSA while doing that CFG edit. 2603 formLCSSA(UpdateL, DT, &LI, SE); 2604 2605 // For loops reached by this loop's original exit blocks we may 2606 // introduced new, non-dedicated exits. At least try to re-form dedicated 2607 // exits for these loops. This may fail if they couldn't have dedicated 2608 // exits to start with. 2609 formDedicatedExitBlocks(&UpdateL, &DT, &LI, MSSAU, /*PreserveLCSSA*/ true); 2610 }; 2611 2612 // For non-child cloned loops and hoisted loops, we just need to update LCSSA 2613 // and we can do it in any order as they don't nest relative to each other. 2614 // 2615 // Also check if any of the loops we have updated have become top-level loops 2616 // as that will necessitate widening the outer loop scope. 2617 for (Loop *UpdatedL : 2618 llvm::concat<Loop *>(NonChildClonedLoops, HoistedLoops)) { 2619 UpdateLoop(*UpdatedL); 2620 if (UpdatedL->isOutermost()) 2621 OuterExitL = nullptr; 2622 } 2623 if (IsStillLoop) { 2624 UpdateLoop(L); 2625 if (L.isOutermost()) 2626 OuterExitL = nullptr; 2627 } 2628 2629 // If the original loop had exit blocks, walk up through the outer most loop 2630 // of those exit blocks to update LCSSA and form updated dedicated exits. 2631 if (OuterExitL != &L) 2632 for (Loop *OuterL = ParentL; OuterL != OuterExitL; 2633 OuterL = OuterL->getParentLoop()) 2634 UpdateLoop(*OuterL); 2635 2636 #ifndef NDEBUG 2637 // Verify the entire loop structure to catch any incorrect updates before we 2638 // progress in the pass pipeline. 2639 LI.verify(DT); 2640 #endif 2641 2642 // Now that we've unswitched something, make callbacks to report the changes. 2643 // For that we need to merge together the updated loops and the cloned loops 2644 // and check whether the original loop survived. 2645 SmallVector<Loop *, 4> SibLoops; 2646 for (Loop *UpdatedL : llvm::concat<Loop *>(NonChildClonedLoops, HoistedLoops)) 2647 if (UpdatedL->getParentLoop() == ParentL) 2648 SibLoops.push_back(UpdatedL); 2649 postUnswitch(L, LoopUpdater, LoopName, IsStillLoop, PartiallyInvariant, 2650 InjectedCondition, SibLoops); 2651 2652 if (MSSAU && VerifyMemorySSA) 2653 MSSAU->getMemorySSA()->verifyMemorySSA(); 2654 2655 if (BI) 2656 ++NumBranches; 2657 else 2658 ++NumSwitches; 2659 } 2660 2661 /// Recursively compute the cost of a dominator subtree based on the per-block 2662 /// cost map provided. 2663 /// 2664 /// The recursive computation is memozied into the provided DT-indexed cost map 2665 /// to allow querying it for most nodes in the domtree without it becoming 2666 /// quadratic. 2667 static InstructionCost computeDomSubtreeCost( 2668 DomTreeNode &N, 2669 const SmallDenseMap<BasicBlock *, InstructionCost, 4> &BBCostMap, 2670 SmallDenseMap<DomTreeNode *, InstructionCost, 4> &DTCostMap) { 2671 // Don't accumulate cost (or recurse through) blocks not in our block cost 2672 // map and thus not part of the duplication cost being considered. 2673 auto BBCostIt = BBCostMap.find(N.getBlock()); 2674 if (BBCostIt == BBCostMap.end()) 2675 return 0; 2676 2677 // Lookup this node to see if we already computed its cost. 2678 auto DTCostIt = DTCostMap.find(&N); 2679 if (DTCostIt != DTCostMap.end()) 2680 return DTCostIt->second; 2681 2682 // If not, we have to compute it. We can't use insert above and update 2683 // because computing the cost may insert more things into the map. 2684 InstructionCost Cost = std::accumulate( 2685 N.begin(), N.end(), BBCostIt->second, 2686 [&](InstructionCost Sum, DomTreeNode *ChildN) -> InstructionCost { 2687 return Sum + computeDomSubtreeCost(*ChildN, BBCostMap, DTCostMap); 2688 }); 2689 bool Inserted = DTCostMap.insert({&N, Cost}).second; 2690 (void)Inserted; 2691 assert(Inserted && "Should not insert a node while visiting children!"); 2692 return Cost; 2693 } 2694 2695 /// Turns a select instruction into implicit control flow branch, 2696 /// making the following replacement: 2697 /// 2698 /// head: 2699 /// --code before select-- 2700 /// select %cond, %trueval, %falseval 2701 /// --code after select-- 2702 /// 2703 /// into 2704 /// 2705 /// head: 2706 /// --code before select-- 2707 /// br i1 %cond, label %then, label %tail 2708 /// 2709 /// then: 2710 /// br %tail 2711 /// 2712 /// tail: 2713 /// phi [ %trueval, %then ], [ %falseval, %head] 2714 /// unreachable 2715 /// 2716 /// It also makes all relevant DT and LI updates, so that all structures are in 2717 /// valid state after this transform. 2718 static BranchInst *turnSelectIntoBranch(SelectInst *SI, DominatorTree &DT, 2719 LoopInfo &LI, MemorySSAUpdater *MSSAU, 2720 AssumptionCache *AC) { 2721 LLVM_DEBUG(dbgs() << "Turning " << *SI << " into a branch.\n"); 2722 BasicBlock *HeadBB = SI->getParent(); 2723 2724 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); 2725 SplitBlockAndInsertIfThen(SI->getCondition(), SI, false, 2726 SI->getMetadata(LLVMContext::MD_prof), &DTU, &LI); 2727 auto *CondBr = cast<BranchInst>(HeadBB->getTerminator()); 2728 BasicBlock *ThenBB = CondBr->getSuccessor(0), 2729 *TailBB = CondBr->getSuccessor(1); 2730 if (MSSAU) 2731 MSSAU->moveAllAfterSpliceBlocks(HeadBB, TailBB, SI); 2732 2733 PHINode *Phi = 2734 PHINode::Create(SI->getType(), 2, "unswitched.select", SI->getIterator()); 2735 Phi->addIncoming(SI->getTrueValue(), ThenBB); 2736 Phi->addIncoming(SI->getFalseValue(), HeadBB); 2737 Phi->setDebugLoc(SI->getDebugLoc()); 2738 SI->replaceAllUsesWith(Phi); 2739 SI->eraseFromParent(); 2740 2741 if (MSSAU && VerifyMemorySSA) 2742 MSSAU->getMemorySSA()->verifyMemorySSA(); 2743 2744 ++NumSelects; 2745 return CondBr; 2746 } 2747 2748 /// Turns a llvm.experimental.guard intrinsic into implicit control flow branch, 2749 /// making the following replacement: 2750 /// 2751 /// --code before guard-- 2752 /// call void (i1, ...) @llvm.experimental.guard(i1 %cond) [ "deopt"() ] 2753 /// --code after guard-- 2754 /// 2755 /// into 2756 /// 2757 /// --code before guard-- 2758 /// br i1 %cond, label %guarded, label %deopt 2759 /// 2760 /// guarded: 2761 /// --code after guard-- 2762 /// 2763 /// deopt: 2764 /// call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ] 2765 /// unreachable 2766 /// 2767 /// It also makes all relevant DT and LI updates, so that all structures are in 2768 /// valid state after this transform. 2769 static BranchInst *turnGuardIntoBranch(IntrinsicInst *GI, Loop &L, 2770 DominatorTree &DT, LoopInfo &LI, 2771 MemorySSAUpdater *MSSAU) { 2772 LLVM_DEBUG(dbgs() << "Turning " << *GI << " into a branch.\n"); 2773 BasicBlock *CheckBB = GI->getParent(); 2774 2775 if (MSSAU && VerifyMemorySSA) 2776 MSSAU->getMemorySSA()->verifyMemorySSA(); 2777 2778 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); 2779 Instruction *DeoptBlockTerm = 2780 SplitBlockAndInsertIfThen(GI->getArgOperand(0), GI, true, 2781 GI->getMetadata(LLVMContext::MD_prof), &DTU, &LI); 2782 BranchInst *CheckBI = cast<BranchInst>(CheckBB->getTerminator()); 2783 // SplitBlockAndInsertIfThen inserts control flow that branches to 2784 // DeoptBlockTerm if the condition is true. We want the opposite. 2785 CheckBI->swapSuccessors(); 2786 2787 BasicBlock *GuardedBlock = CheckBI->getSuccessor(0); 2788 GuardedBlock->setName("guarded"); 2789 CheckBI->getSuccessor(1)->setName("deopt"); 2790 BasicBlock *DeoptBlock = CheckBI->getSuccessor(1); 2791 2792 if (MSSAU) 2793 MSSAU->moveAllAfterSpliceBlocks(CheckBB, GuardedBlock, GI); 2794 2795 GI->moveBefore(DeoptBlockTerm->getIterator()); 2796 GI->setArgOperand(0, ConstantInt::getFalse(GI->getContext())); 2797 2798 if (MSSAU) { 2799 MemoryDef *MD = cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(GI)); 2800 MSSAU->moveToPlace(MD, DeoptBlock, MemorySSA::BeforeTerminator); 2801 if (VerifyMemorySSA) 2802 MSSAU->getMemorySSA()->verifyMemorySSA(); 2803 } 2804 2805 if (VerifyLoopInfo) 2806 LI.verify(DT); 2807 ++NumGuards; 2808 return CheckBI; 2809 } 2810 2811 /// Cost multiplier is a way to limit potentially exponential behavior 2812 /// of loop-unswitch. Cost is multipied in proportion of 2^number of unswitch 2813 /// candidates available. Also accounting for the number of "sibling" loops with 2814 /// the idea to account for previous unswitches that already happened on this 2815 /// cluster of loops. There was an attempt to keep this formula simple, 2816 /// just enough to limit the worst case behavior. Even if it is not that simple 2817 /// now it is still not an attempt to provide a detailed heuristic size 2818 /// prediction. 2819 /// 2820 /// TODO: Make a proper accounting of "explosion" effect for all kinds of 2821 /// unswitch candidates, making adequate predictions instead of wild guesses. 2822 /// That requires knowing not just the number of "remaining" candidates but 2823 /// also costs of unswitching for each of these candidates. 2824 static int CalculateUnswitchCostMultiplier( 2825 const Instruction &TI, const Loop &L, const LoopInfo &LI, 2826 const DominatorTree &DT, 2827 ArrayRef<NonTrivialUnswitchCandidate> UnswitchCandidates) { 2828 2829 // Guards and other exiting conditions do not contribute to exponential 2830 // explosion as soon as they dominate the latch (otherwise there might be 2831 // another path to the latch remaining that does not allow to eliminate the 2832 // loop copy on unswitch). 2833 const BasicBlock *Latch = L.getLoopLatch(); 2834 const BasicBlock *CondBlock = TI.getParent(); 2835 if (DT.dominates(CondBlock, Latch) && 2836 (isGuard(&TI) || 2837 (TI.isTerminator() && 2838 llvm::count_if(successors(&TI), [&L](const BasicBlock *SuccBB) { 2839 return L.contains(SuccBB); 2840 }) <= 1))) { 2841 NumCostMultiplierSkipped++; 2842 return 1; 2843 } 2844 2845 auto *ParentL = L.getParentLoop(); 2846 int SiblingsCount = (ParentL ? ParentL->getSubLoopsVector().size() 2847 : std::distance(LI.begin(), LI.end())); 2848 // Count amount of clones that all the candidates might cause during 2849 // unswitching. Branch/guard/select counts as 1, switch counts as log2 of its 2850 // cases. 2851 int UnswitchedClones = 0; 2852 for (const auto &Candidate : UnswitchCandidates) { 2853 const Instruction *CI = Candidate.TI; 2854 const BasicBlock *CondBlock = CI->getParent(); 2855 bool SkipExitingSuccessors = DT.dominates(CondBlock, Latch); 2856 if (isa<SelectInst>(CI)) { 2857 UnswitchedClones++; 2858 continue; 2859 } 2860 if (isGuard(CI)) { 2861 if (!SkipExitingSuccessors) 2862 UnswitchedClones++; 2863 continue; 2864 } 2865 int NonExitingSuccessors = 2866 llvm::count_if(successors(CondBlock), 2867 [SkipExitingSuccessors, &L](const BasicBlock *SuccBB) { 2868 return !SkipExitingSuccessors || L.contains(SuccBB); 2869 }); 2870 UnswitchedClones += Log2_32(NonExitingSuccessors); 2871 } 2872 2873 // Ignore up to the "unscaled candidates" number of unswitch candidates 2874 // when calculating the power-of-two scaling of the cost. The main idea 2875 // with this control is to allow a small number of unswitches to happen 2876 // and rely more on siblings multiplier (see below) when the number 2877 // of candidates is small. 2878 unsigned ClonesPower = 2879 std::max(UnswitchedClones - (int)UnswitchNumInitialUnscaledCandidates, 0); 2880 2881 // Allowing top-level loops to spread a bit more than nested ones. 2882 int SiblingsMultiplier = 2883 std::max((ParentL ? SiblingsCount 2884 : SiblingsCount / (int)UnswitchSiblingsToplevelDiv), 2885 1); 2886 // Compute the cost multiplier in a way that won't overflow by saturating 2887 // at an upper bound. 2888 int CostMultiplier; 2889 if (ClonesPower > Log2_32(UnswitchThreshold) || 2890 SiblingsMultiplier > UnswitchThreshold) 2891 CostMultiplier = UnswitchThreshold; 2892 else 2893 CostMultiplier = std::min(SiblingsMultiplier * (1 << ClonesPower), 2894 (int)UnswitchThreshold); 2895 2896 LLVM_DEBUG(dbgs() << " Computed multiplier " << CostMultiplier 2897 << " (siblings " << SiblingsMultiplier << " * clones " 2898 << (1 << ClonesPower) << ")" 2899 << " for unswitch candidate: " << TI << "\n"); 2900 return CostMultiplier; 2901 } 2902 2903 static bool collectUnswitchCandidates( 2904 SmallVectorImpl<NonTrivialUnswitchCandidate> &UnswitchCandidates, 2905 IVConditionInfo &PartialIVInfo, Instruction *&PartialIVCondBranch, 2906 const Loop &L, const LoopInfo &LI, AAResults &AA, 2907 const MemorySSAUpdater *MSSAU) { 2908 assert(UnswitchCandidates.empty() && "Should be!"); 2909 2910 auto AddUnswitchCandidatesForInst = [&](Instruction *I, Value *Cond) { 2911 Cond = skipTrivialSelect(Cond); 2912 if (isa<Constant>(Cond)) 2913 return; 2914 if (L.isLoopInvariant(Cond)) { 2915 UnswitchCandidates.push_back({I, {Cond}}); 2916 return; 2917 } 2918 if (match(Cond, m_CombineOr(m_LogicalAnd(), m_LogicalOr()))) { 2919 TinyPtrVector<Value *> Invariants = 2920 collectHomogenousInstGraphLoopInvariants( 2921 L, *static_cast<Instruction *>(Cond), LI); 2922 if (!Invariants.empty()) 2923 UnswitchCandidates.push_back({I, std::move(Invariants)}); 2924 } 2925 }; 2926 2927 // Whether or not we should also collect guards in the loop. 2928 bool CollectGuards = false; 2929 if (UnswitchGuards) { 2930 auto *GuardDecl = Intrinsic::getDeclarationIfExists( 2931 L.getHeader()->getParent()->getParent(), Intrinsic::experimental_guard); 2932 if (GuardDecl && !GuardDecl->use_empty()) 2933 CollectGuards = true; 2934 } 2935 2936 for (auto *BB : L.blocks()) { 2937 if (LI.getLoopFor(BB) != &L) 2938 continue; 2939 2940 for (auto &I : *BB) { 2941 if (auto *SI = dyn_cast<SelectInst>(&I)) { 2942 auto *Cond = SI->getCondition(); 2943 // Do not unswitch vector selects and logical and/or selects 2944 if (Cond->getType()->isIntegerTy(1) && !SI->getType()->isIntegerTy(1)) 2945 AddUnswitchCandidatesForInst(SI, Cond); 2946 } else if (CollectGuards && isGuard(&I)) { 2947 auto *Cond = 2948 skipTrivialSelect(cast<IntrinsicInst>(&I)->getArgOperand(0)); 2949 // TODO: Support AND, OR conditions and partial unswitching. 2950 if (!isa<Constant>(Cond) && L.isLoopInvariant(Cond)) 2951 UnswitchCandidates.push_back({&I, {Cond}}); 2952 } 2953 } 2954 2955 if (auto *SI = dyn_cast<SwitchInst>(BB->getTerminator())) { 2956 // We can only consider fully loop-invariant switch conditions as we need 2957 // to completely eliminate the switch after unswitching. 2958 if (!isa<Constant>(SI->getCondition()) && 2959 L.isLoopInvariant(SI->getCondition()) && !BB->getUniqueSuccessor()) 2960 UnswitchCandidates.push_back({SI, {SI->getCondition()}}); 2961 continue; 2962 } 2963 2964 auto *BI = dyn_cast<BranchInst>(BB->getTerminator()); 2965 if (!BI || !BI->isConditional() || 2966 BI->getSuccessor(0) == BI->getSuccessor(1)) 2967 continue; 2968 2969 AddUnswitchCandidatesForInst(BI, BI->getCondition()); 2970 } 2971 2972 if (MSSAU && !findOptionMDForLoop(&L, "llvm.loop.unswitch.partial.disable") && 2973 !any_of(UnswitchCandidates, [&L](auto &TerminatorAndInvariants) { 2974 return TerminatorAndInvariants.TI == L.getHeader()->getTerminator(); 2975 })) { 2976 MemorySSA *MSSA = MSSAU->getMemorySSA(); 2977 if (auto Info = hasPartialIVCondition(L, MSSAThreshold, *MSSA, AA)) { 2978 LLVM_DEBUG( 2979 dbgs() << "simple-loop-unswitch: Found partially invariant condition " 2980 << *Info->InstToDuplicate[0] << "\n"); 2981 PartialIVInfo = *Info; 2982 PartialIVCondBranch = L.getHeader()->getTerminator(); 2983 TinyPtrVector<Value *> ValsToDuplicate; 2984 llvm::append_range(ValsToDuplicate, Info->InstToDuplicate); 2985 UnswitchCandidates.push_back( 2986 {L.getHeader()->getTerminator(), std::move(ValsToDuplicate)}); 2987 } 2988 } 2989 return !UnswitchCandidates.empty(); 2990 } 2991 2992 /// Tries to canonicalize condition described by: 2993 /// 2994 /// br (LHS pred RHS), label IfTrue, label IfFalse 2995 /// 2996 /// into its equivalent where `Pred` is something that we support for injected 2997 /// invariants (so far it is limited to ult), LHS in canonicalized form is 2998 /// non-invariant and RHS is an invariant. 2999 static void canonicalizeForInvariantConditionInjection(CmpPredicate &Pred, 3000 Value *&LHS, Value *&RHS, 3001 BasicBlock *&IfTrue, 3002 BasicBlock *&IfFalse, 3003 const Loop &L) { 3004 if (!L.contains(IfTrue)) { 3005 Pred = ICmpInst::getInversePredicate(Pred); 3006 std::swap(IfTrue, IfFalse); 3007 } 3008 3009 // Move loop-invariant argument to RHS position. 3010 if (L.isLoopInvariant(LHS)) { 3011 Pred = ICmpInst::getSwappedPredicate(Pred); 3012 std::swap(LHS, RHS); 3013 } 3014 3015 if (Pred == ICmpInst::ICMP_SGE && match(RHS, m_Zero())) { 3016 // Turn "x >=s 0" into "x <u UMIN_INT" 3017 Pred = ICmpInst::ICMP_ULT; 3018 RHS = ConstantInt::get( 3019 RHS->getContext(), 3020 APInt::getSignedMinValue(RHS->getType()->getIntegerBitWidth())); 3021 } 3022 } 3023 3024 /// Returns true, if predicate described by ( \p Pred, \p LHS, \p RHS ) 3025 /// succeeding into blocks ( \p IfTrue, \p IfFalse) can be optimized by 3026 /// injecting a loop-invariant condition. 3027 static bool shouldTryInjectInvariantCondition( 3028 const ICmpInst::Predicate Pred, const Value *LHS, const Value *RHS, 3029 const BasicBlock *IfTrue, const BasicBlock *IfFalse, const Loop &L) { 3030 if (L.isLoopInvariant(LHS) || !L.isLoopInvariant(RHS)) 3031 return false; 3032 // TODO: Support other predicates. 3033 if (Pred != ICmpInst::ICMP_ULT) 3034 return false; 3035 // TODO: Support non-loop-exiting branches? 3036 if (!L.contains(IfTrue) || L.contains(IfFalse)) 3037 return false; 3038 // FIXME: For some reason this causes problems with MSSA updates, need to 3039 // investigate why. So far, just don't unswitch latch. 3040 if (L.getHeader() == IfTrue) 3041 return false; 3042 return true; 3043 } 3044 3045 /// Returns true, if metadata on \p BI allows us to optimize branching into \p 3046 /// TakenSucc via injection of invariant conditions. The branch should be not 3047 /// enough and not previously unswitched, the information about this comes from 3048 /// the metadata. 3049 bool shouldTryInjectBasingOnMetadata(const BranchInst *BI, 3050 const BasicBlock *TakenSucc) { 3051 SmallVector<uint32_t> Weights; 3052 if (!extractBranchWeights(*BI, Weights)) 3053 return false; 3054 unsigned T = InjectInvariantConditionHotnesThreshold; 3055 BranchProbability LikelyTaken(T - 1, T); 3056 3057 assert(Weights.size() == 2 && "Unexpected profile data!"); 3058 size_t Idx = BI->getSuccessor(0) == TakenSucc ? 0 : 1; 3059 auto Num = Weights[Idx]; 3060 auto Denom = Weights[0] + Weights[1]; 3061 // Degenerate or overflowed metadata. 3062 if (Denom == 0 || Num > Denom) 3063 return false; 3064 BranchProbability ActualTaken(Num, Denom); 3065 if (LikelyTaken > ActualTaken) 3066 return false; 3067 return true; 3068 } 3069 3070 /// Materialize pending invariant condition of the given candidate into IR. The 3071 /// injected loop-invariant condition implies the original loop-variant branch 3072 /// condition, so the materialization turns 3073 /// 3074 /// loop_block: 3075 /// ... 3076 /// br i1 %variant_cond, label InLoopSucc, label OutOfLoopSucc 3077 /// 3078 /// into 3079 /// 3080 /// preheader: 3081 /// %invariant_cond = LHS pred RHS 3082 /// ... 3083 /// loop_block: 3084 /// br i1 %invariant_cond, label InLoopSucc, label OriginalCheck 3085 /// OriginalCheck: 3086 /// br i1 %variant_cond, label InLoopSucc, label OutOfLoopSucc 3087 /// ... 3088 static NonTrivialUnswitchCandidate 3089 injectPendingInvariantConditions(NonTrivialUnswitchCandidate Candidate, Loop &L, 3090 DominatorTree &DT, LoopInfo &LI, 3091 AssumptionCache &AC, MemorySSAUpdater *MSSAU) { 3092 assert(Candidate.hasPendingInjection() && "Nothing to inject!"); 3093 BasicBlock *Preheader = L.getLoopPreheader(); 3094 assert(Preheader && "Loop is not in simplified form?"); 3095 assert(LI.getLoopFor(Candidate.TI->getParent()) == &L && 3096 "Unswitching branch of inner loop!"); 3097 3098 auto Pred = Candidate.PendingInjection->Pred; 3099 auto *LHS = Candidate.PendingInjection->LHS; 3100 auto *RHS = Candidate.PendingInjection->RHS; 3101 auto *InLoopSucc = Candidate.PendingInjection->InLoopSucc; 3102 auto *TI = cast<BranchInst>(Candidate.TI); 3103 auto *BB = Candidate.TI->getParent(); 3104 auto *OutOfLoopSucc = InLoopSucc == TI->getSuccessor(0) ? TI->getSuccessor(1) 3105 : TI->getSuccessor(0); 3106 // FIXME: Remove this once limitation on successors is lifted. 3107 assert(L.contains(InLoopSucc) && "Not supported yet!"); 3108 assert(!L.contains(OutOfLoopSucc) && "Not supported yet!"); 3109 auto &Ctx = BB->getContext(); 3110 3111 IRBuilder<> Builder(Preheader->getTerminator()); 3112 assert(ICmpInst::isUnsigned(Pred) && "Not supported yet!"); 3113 if (LHS->getType() != RHS->getType()) { 3114 if (LHS->getType()->getIntegerBitWidth() < 3115 RHS->getType()->getIntegerBitWidth()) 3116 LHS = Builder.CreateZExt(LHS, RHS->getType(), LHS->getName() + ".wide"); 3117 else 3118 RHS = Builder.CreateZExt(RHS, LHS->getType(), RHS->getName() + ".wide"); 3119 } 3120 // Do not use builder here: CreateICmp may simplify this into a constant and 3121 // unswitching will break. Better optimize it away later. 3122 auto *InjectedCond = 3123 ICmpInst::Create(Instruction::ICmp, Pred, LHS, RHS, "injected.cond", 3124 Preheader->getTerminator()->getIterator()); 3125 3126 BasicBlock *CheckBlock = BasicBlock::Create(Ctx, BB->getName() + ".check", 3127 BB->getParent(), InLoopSucc); 3128 Builder.SetInsertPoint(TI); 3129 auto *InvariantBr = 3130 Builder.CreateCondBr(InjectedCond, InLoopSucc, CheckBlock); 3131 3132 Builder.SetInsertPoint(CheckBlock); 3133 Builder.CreateCondBr(TI->getCondition(), TI->getSuccessor(0), 3134 TI->getSuccessor(1)); 3135 TI->eraseFromParent(); 3136 3137 // Fixup phis. 3138 for (auto &I : *InLoopSucc) { 3139 auto *PN = dyn_cast<PHINode>(&I); 3140 if (!PN) 3141 break; 3142 auto *Inc = PN->getIncomingValueForBlock(BB); 3143 PN->addIncoming(Inc, CheckBlock); 3144 } 3145 OutOfLoopSucc->replacePhiUsesWith(BB, CheckBlock); 3146 3147 SmallVector<DominatorTree::UpdateType, 4> DTUpdates = { 3148 { DominatorTree::Insert, BB, CheckBlock }, 3149 { DominatorTree::Insert, CheckBlock, InLoopSucc }, 3150 { DominatorTree::Insert, CheckBlock, OutOfLoopSucc }, 3151 { DominatorTree::Delete, BB, OutOfLoopSucc } 3152 }; 3153 3154 DT.applyUpdates(DTUpdates); 3155 if (MSSAU) 3156 MSSAU->applyUpdates(DTUpdates, DT); 3157 L.addBasicBlockToLoop(CheckBlock, LI); 3158 3159 #ifndef NDEBUG 3160 DT.verify(); 3161 LI.verify(DT); 3162 if (MSSAU && VerifyMemorySSA) 3163 MSSAU->getMemorySSA()->verifyMemorySSA(); 3164 #endif 3165 3166 // TODO: In fact, cost of unswitching a new invariant candidate is *slightly* 3167 // higher because we have just inserted a new block. Need to think how to 3168 // adjust the cost of injected candidates when it was first computed. 3169 LLVM_DEBUG(dbgs() << "Injected a new loop-invariant branch " << *InvariantBr 3170 << " and considering it for unswitching."); 3171 ++NumInvariantConditionsInjected; 3172 return NonTrivialUnswitchCandidate(InvariantBr, { InjectedCond }, 3173 Candidate.Cost); 3174 } 3175 3176 /// Given chain of loop branch conditions looking like: 3177 /// br (Variant < Invariant1) 3178 /// br (Variant < Invariant2) 3179 /// br (Variant < Invariant3) 3180 /// ... 3181 /// collect set of invariant conditions on which we want to unswitch, which 3182 /// look like: 3183 /// Invariant1 <= Invariant2 3184 /// Invariant2 <= Invariant3 3185 /// ... 3186 /// Though they might not immediately exist in the IR, we can still inject them. 3187 static bool insertCandidatesWithPendingInjections( 3188 SmallVectorImpl<NonTrivialUnswitchCandidate> &UnswitchCandidates, Loop &L, 3189 ICmpInst::Predicate Pred, ArrayRef<CompareDesc> Compares, 3190 const DominatorTree &DT) { 3191 3192 assert(ICmpInst::isRelational(Pred)); 3193 assert(ICmpInst::isStrictPredicate(Pred)); 3194 if (Compares.size() < 2) 3195 return false; 3196 ICmpInst::Predicate NonStrictPred = ICmpInst::getNonStrictPredicate(Pred); 3197 for (auto Prev = Compares.begin(), Next = Compares.begin() + 1; 3198 Next != Compares.end(); ++Prev, ++Next) { 3199 Value *LHS = Next->Invariant; 3200 Value *RHS = Prev->Invariant; 3201 BasicBlock *InLoopSucc = Prev->InLoopSucc; 3202 InjectedInvariant ToInject(NonStrictPred, LHS, RHS, InLoopSucc); 3203 NonTrivialUnswitchCandidate Candidate(Prev->Term, { LHS, RHS }, 3204 std::nullopt, std::move(ToInject)); 3205 UnswitchCandidates.push_back(std::move(Candidate)); 3206 } 3207 return true; 3208 } 3209 3210 /// Collect unswitch candidates by invariant conditions that are not immediately 3211 /// present in the loop. However, they can be injected into the code if we 3212 /// decide it's profitable. 3213 /// An example of such conditions is following: 3214 /// 3215 /// for (...) { 3216 /// x = load ... 3217 /// if (! x <u C1) break; 3218 /// if (! x <u C2) break; 3219 /// <do something> 3220 /// } 3221 /// 3222 /// We can unswitch by condition "C1 <=u C2". If that is true, then "x <u C1 <= 3223 /// C2" automatically implies "x <u C2", so we can get rid of one of 3224 /// loop-variant checks in unswitched loop version. 3225 static bool collectUnswitchCandidatesWithInjections( 3226 SmallVectorImpl<NonTrivialUnswitchCandidate> &UnswitchCandidates, 3227 IVConditionInfo &PartialIVInfo, Instruction *&PartialIVCondBranch, Loop &L, 3228 const DominatorTree &DT, const LoopInfo &LI, AAResults &AA, 3229 const MemorySSAUpdater *MSSAU) { 3230 if (!InjectInvariantConditions) 3231 return false; 3232 3233 if (!DT.isReachableFromEntry(L.getHeader())) 3234 return false; 3235 auto *Latch = L.getLoopLatch(); 3236 // Need to have a single latch and a preheader. 3237 if (!Latch) 3238 return false; 3239 assert(L.getLoopPreheader() && "Must have a preheader!"); 3240 3241 DenseMap<Value *, SmallVector<CompareDesc, 4> > CandidatesULT; 3242 // Traverse the conditions that dominate latch (and therefore dominate each 3243 // other). 3244 for (auto *DTN = DT.getNode(Latch); L.contains(DTN->getBlock()); 3245 DTN = DTN->getIDom()) { 3246 CmpPredicate Pred; 3247 Value *LHS = nullptr, *RHS = nullptr; 3248 BasicBlock *IfTrue = nullptr, *IfFalse = nullptr; 3249 auto *BB = DTN->getBlock(); 3250 // Ignore inner loops. 3251 if (LI.getLoopFor(BB) != &L) 3252 continue; 3253 auto *Term = BB->getTerminator(); 3254 if (!match(Term, m_Br(m_ICmp(Pred, m_Value(LHS), m_Value(RHS)), 3255 m_BasicBlock(IfTrue), m_BasicBlock(IfFalse)))) 3256 continue; 3257 if (!LHS->getType()->isIntegerTy()) 3258 continue; 3259 canonicalizeForInvariantConditionInjection(Pred, LHS, RHS, IfTrue, IfFalse, 3260 L); 3261 if (!shouldTryInjectInvariantCondition(Pred, LHS, RHS, IfTrue, IfFalse, L)) 3262 continue; 3263 if (!shouldTryInjectBasingOnMetadata(cast<BranchInst>(Term), IfTrue)) 3264 continue; 3265 // Strip ZEXT for unsigned predicate. 3266 // TODO: once signed predicates are supported, also strip SEXT. 3267 CompareDesc Desc(cast<BranchInst>(Term), RHS, IfTrue); 3268 while (auto *Zext = dyn_cast<ZExtInst>(LHS)) 3269 LHS = Zext->getOperand(0); 3270 CandidatesULT[LHS].push_back(Desc); 3271 } 3272 3273 bool Found = false; 3274 for (auto &It : CandidatesULT) 3275 Found |= insertCandidatesWithPendingInjections( 3276 UnswitchCandidates, L, ICmpInst::ICMP_ULT, It.second, DT); 3277 return Found; 3278 } 3279 3280 static bool isSafeForNoNTrivialUnswitching(Loop &L, LoopInfo &LI) { 3281 if (!L.isSafeToClone()) 3282 return false; 3283 for (auto *BB : L.blocks()) 3284 for (auto &I : *BB) { 3285 if (I.getType()->isTokenTy() && I.isUsedOutsideOfBlock(BB)) 3286 return false; 3287 if (auto *CB = dyn_cast<CallBase>(&I)) { 3288 assert(!CB->cannotDuplicate() && "Checked by L.isSafeToClone()."); 3289 if (CB->isConvergent()) 3290 return false; 3291 } 3292 } 3293 3294 // Check if there are irreducible CFG cycles in this loop. If so, we cannot 3295 // easily unswitch non-trivial edges out of the loop. Doing so might turn the 3296 // irreducible control flow into reducible control flow and introduce new 3297 // loops "out of thin air". If we ever discover important use cases for doing 3298 // this, we can add support to loop unswitch, but it is a lot of complexity 3299 // for what seems little or no real world benefit. 3300 LoopBlocksRPO RPOT(&L); 3301 RPOT.perform(&LI); 3302 if (containsIrreducibleCFG<const BasicBlock *>(RPOT, LI)) 3303 return false; 3304 3305 SmallVector<BasicBlock *, 4> ExitBlocks; 3306 L.getUniqueExitBlocks(ExitBlocks); 3307 // We cannot unswitch if exit blocks contain a cleanuppad/catchswitch 3308 // instruction as we don't know how to split those exit blocks. 3309 // FIXME: We should teach SplitBlock to handle this and remove this 3310 // restriction. 3311 for (auto *ExitBB : ExitBlocks) { 3312 auto It = ExitBB->getFirstNonPHIIt(); 3313 if (isa<CleanupPadInst>(It) || isa<CatchSwitchInst>(It)) { 3314 LLVM_DEBUG(dbgs() << "Cannot unswitch because of cleanuppad/catchswitch " 3315 "in exit block\n"); 3316 return false; 3317 } 3318 } 3319 3320 return true; 3321 } 3322 3323 static NonTrivialUnswitchCandidate findBestNonTrivialUnswitchCandidate( 3324 ArrayRef<NonTrivialUnswitchCandidate> UnswitchCandidates, const Loop &L, 3325 const DominatorTree &DT, const LoopInfo &LI, AssumptionCache &AC, 3326 const TargetTransformInfo &TTI, const IVConditionInfo &PartialIVInfo) { 3327 // Given that unswitching these terminators will require duplicating parts of 3328 // the loop, so we need to be able to model that cost. Compute the ephemeral 3329 // values and set up a data structure to hold per-BB costs. We cache each 3330 // block's cost so that we don't recompute this when considering different 3331 // subsets of the loop for duplication during unswitching. 3332 SmallPtrSet<const Value *, 4> EphValues; 3333 CodeMetrics::collectEphemeralValues(&L, &AC, EphValues); 3334 SmallDenseMap<BasicBlock *, InstructionCost, 4> BBCostMap; 3335 3336 // Compute the cost of each block, as well as the total loop cost. Also, bail 3337 // out if we see instructions which are incompatible with loop unswitching 3338 // (convergent, noduplicate, or cross-basic-block tokens). 3339 // FIXME: We might be able to safely handle some of these in non-duplicated 3340 // regions. 3341 TargetTransformInfo::TargetCostKind CostKind = 3342 L.getHeader()->getParent()->hasMinSize() 3343 ? TargetTransformInfo::TCK_CodeSize 3344 : TargetTransformInfo::TCK_SizeAndLatency; 3345 InstructionCost LoopCost = 0; 3346 for (auto *BB : L.blocks()) { 3347 InstructionCost Cost = 0; 3348 for (auto &I : *BB) { 3349 if (EphValues.count(&I)) 3350 continue; 3351 Cost += TTI.getInstructionCost(&I, CostKind); 3352 } 3353 assert(Cost >= 0 && "Must not have negative costs!"); 3354 LoopCost += Cost; 3355 assert(LoopCost >= 0 && "Must not have negative loop costs!"); 3356 BBCostMap[BB] = Cost; 3357 } 3358 LLVM_DEBUG(dbgs() << " Total loop cost: " << LoopCost << "\n"); 3359 3360 // Now we find the best candidate by searching for the one with the following 3361 // properties in order: 3362 // 3363 // 1) An unswitching cost below the threshold 3364 // 2) The smallest number of duplicated unswitch candidates (to avoid 3365 // creating redundant subsequent unswitching) 3366 // 3) The smallest cost after unswitching. 3367 // 3368 // We prioritize reducing fanout of unswitch candidates provided the cost 3369 // remains below the threshold because this has a multiplicative effect. 3370 // 3371 // This requires memoizing each dominator subtree to avoid redundant work. 3372 // 3373 // FIXME: Need to actually do the number of candidates part above. 3374 SmallDenseMap<DomTreeNode *, InstructionCost, 4> DTCostMap; 3375 // Given a terminator which might be unswitched, computes the non-duplicated 3376 // cost for that terminator. 3377 auto ComputeUnswitchedCost = [&](Instruction &TI, 3378 bool FullUnswitch) -> InstructionCost { 3379 // Unswitching selects unswitches the entire loop. 3380 if (isa<SelectInst>(TI)) 3381 return LoopCost; 3382 3383 BasicBlock &BB = *TI.getParent(); 3384 SmallPtrSet<BasicBlock *, 4> Visited; 3385 3386 InstructionCost Cost = 0; 3387 for (BasicBlock *SuccBB : successors(&BB)) { 3388 // Don't count successors more than once. 3389 if (!Visited.insert(SuccBB).second) 3390 continue; 3391 3392 // If this is a partial unswitch candidate, then it must be a conditional 3393 // branch with a condition of either `or`, `and`, their corresponding 3394 // select forms or partially invariant instructions. In that case, one of 3395 // the successors is necessarily duplicated, so don't even try to remove 3396 // its cost. 3397 if (!FullUnswitch) { 3398 auto &BI = cast<BranchInst>(TI); 3399 Value *Cond = skipTrivialSelect(BI.getCondition()); 3400 if (match(Cond, m_LogicalAnd())) { 3401 if (SuccBB == BI.getSuccessor(1)) 3402 continue; 3403 } else if (match(Cond, m_LogicalOr())) { 3404 if (SuccBB == BI.getSuccessor(0)) 3405 continue; 3406 } else if ((PartialIVInfo.KnownValue->isOneValue() && 3407 SuccBB == BI.getSuccessor(0)) || 3408 (!PartialIVInfo.KnownValue->isOneValue() && 3409 SuccBB == BI.getSuccessor(1))) 3410 continue; 3411 } 3412 3413 // This successor's domtree will not need to be duplicated after 3414 // unswitching if the edge to the successor dominates it (and thus the 3415 // entire tree). This essentially means there is no other path into this 3416 // subtree and so it will end up live in only one clone of the loop. 3417 if (SuccBB->getUniquePredecessor() || 3418 llvm::all_of(predecessors(SuccBB), [&](BasicBlock *PredBB) { 3419 return PredBB == &BB || DT.dominates(SuccBB, PredBB); 3420 })) { 3421 Cost += computeDomSubtreeCost(*DT[SuccBB], BBCostMap, DTCostMap); 3422 assert(Cost <= LoopCost && 3423 "Non-duplicated cost should never exceed total loop cost!"); 3424 } 3425 } 3426 3427 // Now scale the cost by the number of unique successors minus one. We 3428 // subtract one because there is already at least one copy of the entire 3429 // loop. This is computing the new cost of unswitching a condition. 3430 // Note that guards always have 2 unique successors that are implicit and 3431 // will be materialized if we decide to unswitch it. 3432 int SuccessorsCount = isGuard(&TI) ? 2 : Visited.size(); 3433 assert(SuccessorsCount > 1 && 3434 "Cannot unswitch a condition without multiple distinct successors!"); 3435 return (LoopCost - Cost) * (SuccessorsCount - 1); 3436 }; 3437 3438 std::optional<NonTrivialUnswitchCandidate> Best; 3439 for (auto &Candidate : UnswitchCandidates) { 3440 Instruction &TI = *Candidate.TI; 3441 ArrayRef<Value *> Invariants = Candidate.Invariants; 3442 BranchInst *BI = dyn_cast<BranchInst>(&TI); 3443 bool FullUnswitch = 3444 !BI || Candidate.hasPendingInjection() || 3445 (Invariants.size() == 1 && 3446 Invariants[0] == skipTrivialSelect(BI->getCondition())); 3447 InstructionCost CandidateCost = ComputeUnswitchedCost(TI, FullUnswitch); 3448 // Calculate cost multiplier which is a tool to limit potentially 3449 // exponential behavior of loop-unswitch. 3450 if (EnableUnswitchCostMultiplier) { 3451 int CostMultiplier = 3452 CalculateUnswitchCostMultiplier(TI, L, LI, DT, UnswitchCandidates); 3453 assert( 3454 (CostMultiplier > 0 && CostMultiplier <= UnswitchThreshold) && 3455 "cost multiplier needs to be in the range of 1..UnswitchThreshold"); 3456 CandidateCost *= CostMultiplier; 3457 LLVM_DEBUG(dbgs() << " Computed cost of " << CandidateCost 3458 << " (multiplier: " << CostMultiplier << ")" 3459 << " for unswitch candidate: " << TI << "\n"); 3460 } else { 3461 LLVM_DEBUG(dbgs() << " Computed cost of " << CandidateCost 3462 << " for unswitch candidate: " << TI << "\n"); 3463 } 3464 3465 if (!Best || CandidateCost < Best->Cost) { 3466 Best = Candidate; 3467 Best->Cost = CandidateCost; 3468 } 3469 } 3470 assert(Best && "Must be!"); 3471 return *Best; 3472 } 3473 3474 // Insert a freeze on an unswitched branch if all is true: 3475 // 1. freeze-loop-unswitch-cond option is true 3476 // 2. The branch may not execute in the loop pre-transformation. If a branch may 3477 // not execute and could cause UB, it would always cause UB if it is hoisted outside 3478 // of the loop. Insert a freeze to prevent this case. 3479 // 3. The branch condition may be poison or undef 3480 static bool shouldInsertFreeze(Loop &L, Instruction &TI, DominatorTree &DT, 3481 AssumptionCache &AC) { 3482 assert(isa<BranchInst>(TI) || isa<SwitchInst>(TI)); 3483 if (!FreezeLoopUnswitchCond) 3484 return false; 3485 3486 ICFLoopSafetyInfo SafetyInfo; 3487 SafetyInfo.computeLoopSafetyInfo(&L); 3488 if (SafetyInfo.isGuaranteedToExecute(TI, &DT, &L)) 3489 return false; 3490 3491 Value *Cond; 3492 if (BranchInst *BI = dyn_cast<BranchInst>(&TI)) 3493 Cond = skipTrivialSelect(BI->getCondition()); 3494 else 3495 Cond = skipTrivialSelect(cast<SwitchInst>(&TI)->getCondition()); 3496 return !isGuaranteedNotToBeUndefOrPoison( 3497 Cond, &AC, L.getLoopPreheader()->getTerminator(), &DT); 3498 } 3499 3500 static bool unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI, 3501 AssumptionCache &AC, AAResults &AA, 3502 TargetTransformInfo &TTI, ScalarEvolution *SE, 3503 MemorySSAUpdater *MSSAU, 3504 LPMUpdater &LoopUpdater) { 3505 // Collect all invariant conditions within this loop (as opposed to an inner 3506 // loop which would be handled when visiting that inner loop). 3507 SmallVector<NonTrivialUnswitchCandidate, 4> UnswitchCandidates; 3508 IVConditionInfo PartialIVInfo; 3509 Instruction *PartialIVCondBranch = nullptr; 3510 collectUnswitchCandidates(UnswitchCandidates, PartialIVInfo, 3511 PartialIVCondBranch, L, LI, AA, MSSAU); 3512 if (!findOptionMDForLoop(&L, "llvm.loop.unswitch.injection.disable")) 3513 collectUnswitchCandidatesWithInjections(UnswitchCandidates, PartialIVInfo, 3514 PartialIVCondBranch, L, DT, LI, AA, 3515 MSSAU); 3516 // If we didn't find any candidates, we're done. 3517 if (UnswitchCandidates.empty()) 3518 return false; 3519 3520 LLVM_DEBUG( 3521 dbgs() << "Considering " << UnswitchCandidates.size() 3522 << " non-trivial loop invariant conditions for unswitching.\n"); 3523 3524 NonTrivialUnswitchCandidate Best = findBestNonTrivialUnswitchCandidate( 3525 UnswitchCandidates, L, DT, LI, AC, TTI, PartialIVInfo); 3526 3527 assert(Best.TI && "Failed to find loop unswitch candidate"); 3528 assert(Best.Cost && "Failed to compute cost"); 3529 3530 if (*Best.Cost >= UnswitchThreshold) { 3531 LLVM_DEBUG(dbgs() << "Cannot unswitch, lowest cost found: " << *Best.Cost 3532 << "\n"); 3533 return false; 3534 } 3535 3536 bool InjectedCondition = false; 3537 if (Best.hasPendingInjection()) { 3538 Best = injectPendingInvariantConditions(Best, L, DT, LI, AC, MSSAU); 3539 InjectedCondition = true; 3540 } 3541 assert(!Best.hasPendingInjection() && 3542 "All injections should have been done by now!"); 3543 3544 if (Best.TI != PartialIVCondBranch) 3545 PartialIVInfo.InstToDuplicate.clear(); 3546 3547 bool InsertFreeze; 3548 if (auto *SI = dyn_cast<SelectInst>(Best.TI)) { 3549 // If the best candidate is a select, turn it into a branch. Select 3550 // instructions with a poison conditional do not propagate poison, but 3551 // branching on poison causes UB. Insert a freeze on the select 3552 // conditional to prevent UB after turning the select into a branch. 3553 InsertFreeze = !isGuaranteedNotToBeUndefOrPoison( 3554 SI->getCondition(), &AC, L.getLoopPreheader()->getTerminator(), &DT); 3555 Best.TI = turnSelectIntoBranch(SI, DT, LI, MSSAU, &AC); 3556 } else { 3557 // If the best candidate is a guard, turn it into a branch. 3558 if (isGuard(Best.TI)) 3559 Best.TI = 3560 turnGuardIntoBranch(cast<IntrinsicInst>(Best.TI), L, DT, LI, MSSAU); 3561 InsertFreeze = shouldInsertFreeze(L, *Best.TI, DT, AC); 3562 } 3563 3564 LLVM_DEBUG(dbgs() << " Unswitching non-trivial (cost = " << Best.Cost 3565 << ") terminator: " << *Best.TI << "\n"); 3566 unswitchNontrivialInvariants(L, *Best.TI, Best.Invariants, PartialIVInfo, DT, 3567 LI, AC, SE, MSSAU, LoopUpdater, InsertFreeze, 3568 InjectedCondition); 3569 return true; 3570 } 3571 3572 /// Unswitch control flow predicated on loop invariant conditions. 3573 /// 3574 /// This first hoists all branches or switches which are trivial (IE, do not 3575 /// require duplicating any part of the loop) out of the loop body. It then 3576 /// looks at other loop invariant control flows and tries to unswitch those as 3577 /// well by cloning the loop if the result is small enough. 3578 /// 3579 /// The `DT`, `LI`, `AC`, `AA`, `TTI` parameters are required analyses that are 3580 /// also updated based on the unswitch. The `MSSA` analysis is also updated if 3581 /// valid (i.e. its use is enabled). 3582 /// 3583 /// If either `NonTrivial` is true or the flag `EnableNonTrivialUnswitch` is 3584 /// true, we will attempt to do non-trivial unswitching as well as trivial 3585 /// unswitching. 3586 /// 3587 /// The `postUnswitch` function will be run after unswitching is complete 3588 /// with information on whether or not the provided loop remains a loop and 3589 /// a list of new sibling loops created. 3590 /// 3591 /// If `SE` is non-null, we will update that analysis based on the unswitching 3592 /// done. 3593 static bool unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, 3594 AssumptionCache &AC, AAResults &AA, 3595 TargetTransformInfo &TTI, bool Trivial, 3596 bool NonTrivial, ScalarEvolution *SE, 3597 MemorySSAUpdater *MSSAU, ProfileSummaryInfo *PSI, 3598 BlockFrequencyInfo *BFI, LPMUpdater &LoopUpdater) { 3599 assert(L.isRecursivelyLCSSAForm(DT, LI) && 3600 "Loops must be in LCSSA form before unswitching."); 3601 3602 // Must be in loop simplified form: we need a preheader and dedicated exits. 3603 if (!L.isLoopSimplifyForm()) 3604 return false; 3605 3606 // Try trivial unswitch first before loop over other basic blocks in the loop. 3607 if (Trivial && unswitchAllTrivialConditions(L, DT, LI, SE, MSSAU)) { 3608 // If we unswitched successfully we will want to clean up the loop before 3609 // processing it further so just mark it as unswitched and return. 3610 postUnswitch(L, LoopUpdater, L.getName(), 3611 /*CurrentLoopValid*/ true, /*PartiallyInvariant*/ false, 3612 /*InjectedCondition*/ false, {}); 3613 return true; 3614 } 3615 3616 const Function *F = L.getHeader()->getParent(); 3617 3618 // Check whether we should continue with non-trivial conditions. 3619 // EnableNonTrivialUnswitch: Global variable that forces non-trivial 3620 // unswitching for testing and debugging. 3621 // NonTrivial: Parameter that enables non-trivial unswitching for this 3622 // invocation of the transform. But this should be allowed only 3623 // for targets without branch divergence. 3624 // 3625 // FIXME: If divergence analysis becomes available to a loop 3626 // transform, we should allow unswitching for non-trivial uniform 3627 // branches even on targets that have divergence. 3628 // https://bugs.llvm.org/show_bug.cgi?id=48819 3629 bool ContinueWithNonTrivial = 3630 EnableNonTrivialUnswitch || (NonTrivial && !TTI.hasBranchDivergence(F)); 3631 if (!ContinueWithNonTrivial) 3632 return false; 3633 3634 // Skip non-trivial unswitching for optsize functions. 3635 if (F->hasOptSize()) 3636 return false; 3637 3638 // Returns true if Loop L's loop nest is cold, i.e. if the headers of L, 3639 // of the loops L is nested in, and of the loops nested in L are all cold. 3640 auto IsLoopNestCold = [&](const Loop *L) { 3641 // Check L and all of its parent loops. 3642 auto *Parent = L; 3643 while (Parent) { 3644 if (!PSI->isColdBlock(Parent->getHeader(), BFI)) 3645 return false; 3646 Parent = Parent->getParentLoop(); 3647 } 3648 // Next check all loops nested within L. 3649 SmallVector<const Loop *, 4> Worklist; 3650 llvm::append_range(Worklist, L->getSubLoops()); 3651 while (!Worklist.empty()) { 3652 auto *CurLoop = Worklist.pop_back_val(); 3653 if (!PSI->isColdBlock(CurLoop->getHeader(), BFI)) 3654 return false; 3655 llvm::append_range(Worklist, CurLoop->getSubLoops()); 3656 } 3657 return true; 3658 }; 3659 3660 // Skip cold loops in cold loop nests, as unswitching them brings little 3661 // benefit but increases the code size 3662 if (PSI && PSI->hasProfileSummary() && BFI && IsLoopNestCold(&L)) { 3663 LLVM_DEBUG(dbgs() << " Skip cold loop: " << L << "\n"); 3664 return false; 3665 } 3666 3667 // Perform legality checks. 3668 if (!isSafeForNoNTrivialUnswitching(L, LI)) 3669 return false; 3670 3671 // For non-trivial unswitching, because it often creates new loops, we rely on 3672 // the pass manager to iterate on the loops rather than trying to immediately 3673 // reach a fixed point. There is no substantial advantage to iterating 3674 // internally, and if any of the new loops are simplified enough to contain 3675 // trivial unswitching we want to prefer those. 3676 3677 // Try to unswitch the best invariant condition. We prefer this full unswitch to 3678 // a partial unswitch when possible below the threshold. 3679 if (unswitchBestCondition(L, DT, LI, AC, AA, TTI, SE, MSSAU, LoopUpdater)) 3680 return true; 3681 3682 // No other opportunities to unswitch. 3683 return false; 3684 } 3685 3686 PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM, 3687 LoopStandardAnalysisResults &AR, 3688 LPMUpdater &U) { 3689 Function &F = *L.getHeader()->getParent(); 3690 (void)F; 3691 ProfileSummaryInfo *PSI = nullptr; 3692 if (auto OuterProxy = 3693 AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR) 3694 .getCachedResult<ModuleAnalysisManagerFunctionProxy>(F)) 3695 PSI = OuterProxy->getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); 3696 LLVM_DEBUG(dbgs() << "Unswitching loop in " << F.getName() << ": " << L 3697 << "\n"); 3698 3699 std::optional<MemorySSAUpdater> MSSAU; 3700 if (AR.MSSA) { 3701 MSSAU = MemorySSAUpdater(AR.MSSA); 3702 if (VerifyMemorySSA) 3703 AR.MSSA->verifyMemorySSA(); 3704 } 3705 if (!unswitchLoop(L, AR.DT, AR.LI, AR.AC, AR.AA, AR.TTI, Trivial, NonTrivial, 3706 &AR.SE, MSSAU ? &*MSSAU : nullptr, PSI, AR.BFI, U)) 3707 return PreservedAnalyses::all(); 3708 3709 if (AR.MSSA && VerifyMemorySSA) 3710 AR.MSSA->verifyMemorySSA(); 3711 3712 // Historically this pass has had issues with the dominator tree so verify it 3713 // in asserts builds. 3714 assert(AR.DT.verify(DominatorTree::VerificationLevel::Fast)); 3715 3716 auto PA = getLoopPassPreservedAnalyses(); 3717 if (AR.MSSA) 3718 PA.preserve<MemorySSAAnalysis>(); 3719 return PA; 3720 } 3721 3722 void SimpleLoopUnswitchPass::printPipeline( 3723 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) { 3724 static_cast<PassInfoMixin<SimpleLoopUnswitchPass> *>(this)->printPipeline( 3725 OS, MapClassName2PassName); 3726 3727 OS << '<'; 3728 OS << (NonTrivial ? "" : "no-") << "nontrivial;"; 3729 OS << (Trivial ? "" : "no-") << "trivial"; 3730 OS << '>'; 3731 } 3732