//===- LoopDistribute.cpp - Loop Distribution Pass ------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the Loop Distribution Pass. Its main focus is to
// distribute loops that cannot be vectorized due to dependence cycles. It
// tries to isolate the offending dependences into a new loop allowing
// vectorization of the remaining parts.
//
// For dependence analysis, the pass uses the LoopVectorizer's
// LoopAccessAnalysis. Because this analysis presumes no change in the order of
// memory operations, special care is taken to preserve the lexical order of
// these operations.
//
// Similarly to the Vectorizer, the pass also supports loop versioning to
// disambiguate potentially overlapping arrays at run time.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Scalar/LoopDistribute.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Value.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <cassert>
#include <functional>
#include <list>
#include <tuple>
#include <utility>

using namespace llvm;

#define LDIST_NAME "loop-distribute"
#define DEBUG_TYPE LDIST_NAME

/// @{
/// Metadata attribute names
static const char *const LLVMLoopDistributeFollowupAll =
    "llvm.loop.distribute.followup_all";
static const char *const LLVMLoopDistributeFollowupCoincident =
    "llvm.loop.distribute.followup_coincident";
static const char *const LLVMLoopDistributeFollowupSequential =
    "llvm.loop.distribute.followup_sequential";
static const char *const LLVMLoopDistributeFollowupFallback =
    "llvm.loop.distribute.followup_fallback";
/// @}
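
// A sketch of how these followup attributes are requested (the metadata nodes
// below are hypothetical, not from a test case; see docs/TransformMetadata.rst
// for the authoritative format). A loop asking that the dependence-free
// (coincident) loops created by distribution be vectorized could carry:
//
//   !0 = distinct !{!0, !1, !2}
//   !1 = !{!"llvm.loop.distribute.enable", i1 true}
//   !2 = !{!"llvm.loop.distribute.followup_coincident", !3}
//   !3 = !{!"llvm.loop.vectorize.enable", i1 true}
//
// makeFollowupLoopID (used in setNewLoopID below) builds the new loops' IDs
// from the attributes listed after the followup name.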
"llvm.loop.distribute.followup_fallback"; 88 /// @} 89 90 static cl::opt<bool> 91 LDistVerify("loop-distribute-verify", cl::Hidden, 92 cl::desc("Turn on DominatorTree and LoopInfo verification " 93 "after Loop Distribution"), 94 cl::init(false)); 95 96 static cl::opt<bool> DistributeNonIfConvertible( 97 "loop-distribute-non-if-convertible", cl::Hidden, 98 cl::desc("Whether to distribute into a loop that may not be " 99 "if-convertible by the loop vectorizer"), 100 cl::init(false)); 101 102 static cl::opt<unsigned> DistributeSCEVCheckThreshold( 103 "loop-distribute-scev-check-threshold", cl::init(8), cl::Hidden, 104 cl::desc("The maximum number of SCEV checks allowed for Loop " 105 "Distribution")); 106 107 static cl::opt<unsigned> PragmaDistributeSCEVCheckThreshold( 108 "loop-distribute-scev-check-threshold-with-pragma", cl::init(128), 109 cl::Hidden, 110 cl::desc( 111 "The maximum number of SCEV checks allowed for Loop " 112 "Distribution for loop marked with #pragma loop distribute(enable)")); 113 114 static cl::opt<bool> EnableLoopDistribute( 115 "enable-loop-distribute", cl::Hidden, 116 cl::desc("Enable the new, experimental LoopDistribution Pass"), 117 cl::init(false)); 118 119 STATISTIC(NumLoopsDistributed, "Number of loops distributed"); 120 121 namespace { 122 123 /// Maintains the set of instructions of the loop for a partition before 124 /// cloning. After cloning, it hosts the new loop. 125 class InstPartition { 126 using InstructionSet = SmallPtrSet<Instruction *, 8>; 127 128 public: 129 InstPartition(Instruction *I, Loop *L, bool DepCycle = false) 130 : DepCycle(DepCycle), OrigLoop(L) { 131 Set.insert(I); 132 } 133 134 /// Returns whether this partition contains a dependence cycle. 135 bool hasDepCycle() const { return DepCycle; } 136 137 /// Adds an instruction to this partition. 138 void add(Instruction *I) { Set.insert(I); } 139 140 /// Collection accessors. 141 InstructionSet::iterator begin() { return Set.begin(); } 142 InstructionSet::iterator end() { return Set.end(); } 143 InstructionSet::const_iterator begin() const { return Set.begin(); } 144 InstructionSet::const_iterator end() const { return Set.end(); } 145 bool empty() const { return Set.empty(); } 146 147 /// Moves this partition into \p Other. This partition becomes empty 148 /// after this. 149 void moveTo(InstPartition &Other) { 150 Other.Set.insert(Set.begin(), Set.end()); 151 Set.clear(); 152 Other.DepCycle |= DepCycle; 153 } 154 155 /// Populates the partition with a transitive closure of all the 156 /// instructions that the seeded instructions dependent on. 157 void populateUsedSet() { 158 // FIXME: We currently don't use control-dependence but simply include all 159 // blocks (possibly empty at the end) and let simplifycfg mostly clean this 160 // up. 161 for (auto *B : OrigLoop->getBlocks()) 162 Set.insert(B->getTerminator()); 163 164 // Follow the use-def chains to form a transitive closure of all the 165 // instructions that the originally seeded instructions depend on. 166 SmallVector<Instruction *, 8> Worklist(Set.begin(), Set.end()); 167 while (!Worklist.empty()) { 168 Instruction *I = Worklist.pop_back_val(); 169 // Insert instructions from the loop that we depend on. 170 for (Value *V : I->operand_values()) { 171 auto *I = dyn_cast<Instruction>(V); 172 if (I && OrigLoop->contains(I->getParent()) && Set.insert(I).second) 173 Worklist.push_back(I); 174 } 175 } 176 } 177 178 /// Clones the original loop. 

  /// Clones the original loop.
  ///
  /// Updates LoopInfo and DominatorTree using the information that block \p
  /// LoopDomBB dominates the loop.
  Loop *cloneLoopWithPreheader(BasicBlock *InsertBefore, BasicBlock *LoopDomBB,
                               unsigned Index, LoopInfo *LI,
                               DominatorTree *DT) {
    ClonedLoop = ::cloneLoopWithPreheader(InsertBefore, LoopDomBB, OrigLoop,
                                          VMap, Twine(".ldist") + Twine(Index),
                                          LI, DT, ClonedLoopBlocks);
    return ClonedLoop;
  }

  /// The cloned loop. If this partition is mapped to the original loop,
  /// this is null.
  const Loop *getClonedLoop() const { return ClonedLoop; }

  /// Returns the loop where this partition ends up after distribution.
  /// If this partition is mapped to the original loop then use the block from
  /// the loop.
  Loop *getDistributedLoop() const {
    return ClonedLoop ? ClonedLoop : OrigLoop;
  }

  /// The VMap that is populated by cloning and then used in
  /// remapInstructions to remap the cloned instructions.
  ValueToValueMapTy &getVMap() { return VMap; }

  /// Remaps the cloned instructions using VMap.
  void remapInstructions() {
    remapInstructionsInBlocks(ClonedLoopBlocks, VMap);
  }

  /// Based on the set of instructions selected for this partition,
  /// removes the unnecessary ones.
  void removeUnusedInsts() {
    SmallVector<Instruction *, 8> Unused;

    for (auto *Block : OrigLoop->getBlocks())
      for (auto &Inst : *Block)
        if (!Set.count(&Inst)) {
          Instruction *NewInst = &Inst;
          if (!VMap.empty())
            NewInst = cast<Instruction>(VMap[NewInst]);

          assert(!isa<BranchInst>(NewInst) &&
                 "Branches are marked used early on");
          Unused.push_back(NewInst);
        }

    // Delete the instructions backwards, as it has a reduced likelihood of
    // having to update as many def-use and use-def chains.
    for (auto *Inst : reverse(Unused)) {
      if (!Inst->use_empty())
        Inst->replaceAllUsesWith(PoisonValue::get(Inst->getType()));
      Inst->eraseFromParent();
    }
  }

  void print() const {
    if (DepCycle)
      dbgs() << "  (cycle)\n";
    for (auto *I : Set)
      // Prefix with the block name.
      dbgs() << "  " << I->getParent()->getName() << ":" << *I << "\n";
  }

  void printBlocks() const {
    for (auto *BB : getDistributedLoop()->getBlocks())
      dbgs() << *BB;
  }

private:
  /// Instructions from OrigLoop selected for this partition.
  InstructionSet Set;

  /// Whether this partition contains a dependence cycle.
  bool DepCycle;

  /// The original loop.
  Loop *OrigLoop;

  /// The cloned loop. If this partition is mapped to the original loop,
  /// this is null.
  Loop *ClonedLoop = nullptr;

  /// The blocks of ClonedLoop including the preheader. If this
  /// partition is mapped to the original loop, this is empty.
  SmallVector<BasicBlock *, 8> ClonedLoopBlocks;

  /// This gets populated once the set of instructions has been
  /// finalized. If this partition is mapped to the original loop, it is not
  /// set.
  ValueToValueMapTy VMap;
};
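
// An illustrative scenario (hypothetical source loop, in the spirit of the
// pass description above): given
//
//   for (i = 0; i < n; i++) {
//     A[i + 1] = A[i] * B[i];  // participates in a dependence cycle
//     C[i] = D[i] * E[i];      // no cycle
//   }
//
// the container below ends up with a cyclic partition seeded from the store
// to A and a non-cyclic one seeded from the store to C; after distribution
// the second loop can be vectorized.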

/// Holds the set of Partitions. It populates them, merges them and then
/// clones the loops.
class InstPartitionContainer {
  using InstToPartitionIdT = DenseMap<Instruction *, int>;

public:
  InstPartitionContainer(Loop *L, LoopInfo *LI, DominatorTree *DT)
      : L(L), LI(LI), DT(DT) {}

  /// Returns the number of partitions.
  unsigned getSize() const { return PartitionContainer.size(); }

  /// Adds \p Inst into the current partition if that is marked to
  /// contain cycles. Otherwise start a new partition for it.
  void addToCyclicPartition(Instruction *Inst) {
    // If the current partition is non-cyclic, start a new one.
    if (PartitionContainer.empty() || !PartitionContainer.back().hasDepCycle())
      PartitionContainer.emplace_back(Inst, L, /*DepCycle=*/true);
    else
      PartitionContainer.back().add(Inst);
  }

  /// Adds \p Inst into a partition that is not marked to contain
  /// dependence cycles.
  ///
  // Initially we isolate memory instructions into as many partitions as
  // possible, then later we may merge them back together.
  void addToNewNonCyclicPartition(Instruction *Inst) {
    PartitionContainer.emplace_back(Inst, L);
  }

  /// Merges adjacent non-cyclic partitions.
  ///
  /// The idea is that we currently only want to isolate the non-vectorizable
  /// partition. We could later allow more distribution among these partitions
  /// too.
  void mergeAdjacentNonCyclic() {
    mergeAdjacentPartitionsIf(
        [](const InstPartition *P) { return !P->hasDepCycle(); });
  }

  /// If a partition contains only conditional stores, we won't vectorize
  /// it. Try to merge it with a previous cyclic partition.
  void mergeNonIfConvertible() {
    mergeAdjacentPartitionsIf([&](const InstPartition *Partition) {
      if (Partition->hasDepCycle())
        return true;

      // Now, check if all stores are conditional in this partition.
      bool seenStore = false;

      for (auto *Inst : *Partition)
        if (isa<StoreInst>(Inst)) {
          seenStore = true;
          if (!LoopAccessInfo::blockNeedsPredication(Inst->getParent(), L, DT))
            return false;
        }
      return seenStore;
    });
  }

  /// Merges the partitions according to various heuristics.
  void mergeBeforePopulating() {
    mergeAdjacentNonCyclic();
    if (!DistributeNonIfConvertible)
      mergeNonIfConvertible();
  }
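
  // An illustrative case (hypothetical partitions) for the merge below: if
  // the same "load %p" was seeded into partitions P0 and P2 of the sequence
  // P0, P1, P2, cloning it into both loops could move the second copy across
  // the memory operations in P1; instead, (P0, P2], i.e. P1 and P2, is merged
  // into P0 so the load exists exactly once after distribution.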

  /// Merges partitions in order to ensure that no loads are duplicated.
  ///
  /// We can't duplicate loads because that could potentially reorder them.
  /// LoopAccessAnalysis provides dependency information with the context that
  /// the order of memory operations is preserved.
  ///
  /// Returns true if any partitions were merged.
  bool mergeToAvoidDuplicatedLoads() {
    using LoadToPartitionT = DenseMap<Instruction *, InstPartition *>;
    using ToBeMergedT = EquivalenceClasses<InstPartition *>;

    LoadToPartitionT LoadToPartition;
    ToBeMergedT ToBeMerged;

    // Step through the partitions and create equivalence between partitions
    // that contain the same load. Also put partitions in between them in the
    // same equivalence class to avoid reordering of memory operations.
    for (PartitionContainerT::iterator I = PartitionContainer.begin(),
                                       E = PartitionContainer.end();
         I != E; ++I) {
      auto *PartI = &*I;

      // If a load occurs in two partitions PartI and PartJ, merge all
      // partitions (PartI, PartJ] into PartI.
      for (Instruction *Inst : *PartI)
        if (isa<LoadInst>(Inst)) {
          bool NewElt;
          LoadToPartitionT::iterator LoadToPart;

          std::tie(LoadToPart, NewElt) =
              LoadToPartition.insert(std::make_pair(Inst, PartI));
          if (!NewElt) {
            LLVM_DEBUG(dbgs()
                       << "Merging partitions due to this load in multiple "
                       << "partitions: " << PartI << ", " << LoadToPart->second
                       << "\n"
                       << *Inst << "\n");

            auto PartJ = I;
            do {
              --PartJ;
              ToBeMerged.unionSets(PartI, &*PartJ);
            } while (&*PartJ != LoadToPart->second);
          }
        }
    }
    if (ToBeMerged.empty())
      return false;

    // Merge the members of an equivalence class into its class leader. This
    // makes the members empty.
    for (ToBeMergedT::iterator I = ToBeMerged.begin(), E = ToBeMerged.end();
         I != E; ++I) {
      if (!I->isLeader())
        continue;

      auto PartI = I->getData();
      for (auto *PartJ : make_range(std::next(ToBeMerged.member_begin(I)),
                                    ToBeMerged.member_end())) {
        PartJ->moveTo(*PartI);
      }
    }

    // Remove the empty partitions.
    PartitionContainer.remove_if(
        [](const InstPartition &P) { return P.empty(); });

    return true;
  }

  /// Sets up the mapping from instructions to partitions. If the
  /// instruction is duplicated across multiple partitions, set the entry to
  /// -1.
  void setupPartitionIdOnInstructions() {
    int PartitionID = 0;
    for (const auto &Partition : PartitionContainer) {
      for (Instruction *Inst : Partition) {
        bool NewElt;
        InstToPartitionIdT::iterator Iter;

        std::tie(Iter, NewElt) =
            InstToPartitionId.insert(std::make_pair(Inst, PartitionID));
        if (!NewElt)
          Iter->second = -1;
      }
      ++PartitionID;
    }
  }

  /// Populates the partitions with everything that the seeding
  /// instructions require.
  void populateUsedSet() {
    for (auto &P : PartitionContainer)
      P.populateUsedSet();
  }
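
  // Sketch of the CFG produced by cloneLoops below for three partitions (the
  // last one reuses the original loop; block names are illustrative):
  //
  //   Pred -> PH0 -> Loop0 -> PH1 -> Loop1 -> OrigPH -> OrigLoop -> Exit
  //
  // Each cloned loop's exit edge is remapped to the preheader of the next
  // partition's loop, so the partitions execute sequentially in the original
  // program order.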

  /// This performs the main chunk of the work of cloning the loops for
  /// the partitions.
  void cloneLoops() {
    BasicBlock *OrigPH = L->getLoopPreheader();
    // At this point the predecessor of the preheader is either the memcheck
    // block or the top part of the original preheader.
    BasicBlock *Pred = OrigPH->getSinglePredecessor();
    assert(Pred && "Preheader does not have a single predecessor");
    BasicBlock *ExitBlock = L->getExitBlock();
    assert(ExitBlock && "No single exit block");
    Loop *NewLoop;

    assert(!PartitionContainer.empty() && "at least two partitions expected");
    // We're cloning the preheader along with the loop so we already made sure
    // it was empty.
    assert(&*OrigPH->begin() == OrigPH->getTerminator() &&
           "preheader not empty");

    // Preserve the original loop ID for use after the transformation.
    MDNode *OrigLoopID = L->getLoopID();

    // Create a loop for each partition except the last. Clone the original
    // loop before PH along with adding a preheader for the cloned loop. Then
    // update PH to point to the newly added preheader.
    BasicBlock *TopPH = OrigPH;
    unsigned Index = getSize() - 1;
    for (auto &Part : llvm::drop_begin(llvm::reverse(PartitionContainer))) {
      NewLoop = Part.cloneLoopWithPreheader(TopPH, Pred, Index, LI, DT);

      Part.getVMap()[ExitBlock] = TopPH;
      Part.remapInstructions();
      setNewLoopID(OrigLoopID, &Part);
      --Index;
      TopPH = NewLoop->getLoopPreheader();
    }
    Pred->getTerminator()->replaceUsesOfWith(OrigPH, TopPH);

    // Also set a new loop ID for the last loop.
    setNewLoopID(OrigLoopID, &PartitionContainer.back());

    // Now go in forward order and update the immediate dominator for the
    // preheaders with the exiting block of the previous loop. Dominance
    // within the loop is updated in cloneLoopWithPreheader.
    for (auto Curr = PartitionContainer.cbegin(),
              Next = std::next(PartitionContainer.cbegin()),
              E = PartitionContainer.cend();
         Next != E; ++Curr, ++Next)
      DT->changeImmediateDominator(
          Next->getDistributedLoop()->getLoopPreheader(),
          Curr->getDistributedLoop()->getExitingBlock());
  }

  /// Removes the dead instructions from the cloned loops.
  void removeUnusedInsts() {
    for (auto &Partition : PartitionContainer)
      Partition.removeUnusedInsts();
  }

  /// For each memory pointer, it computes the partitionId the pointer is
  /// used in.
  ///
  /// This returns an array of int where the I-th entry corresponds to the
  /// I-th entry in LAI.getRuntimePointerChecking(). If the pointer is used in
  /// multiple partitions its entry is set to -1.
  SmallVector<int, 8>
  computePartitionSetForPointers(const LoopAccessInfo &LAI) {
    const RuntimePointerChecking *RtPtrCheck = LAI.getRuntimePointerChecking();

    unsigned N = RtPtrCheck->Pointers.size();
    SmallVector<int, 8> PtrToPartitions(N);
    for (unsigned I = 0; I < N; ++I) {
      Value *Ptr = RtPtrCheck->Pointers[I].PointerValue;
      auto Instructions =
          LAI.getInstructionsForAccess(Ptr, RtPtrCheck->Pointers[I].IsWritePtr);

      int &Partition = PtrToPartitions[I];
      // First set it to uninitialized.
      Partition = -2;
      for (Instruction *Inst : Instructions) {
        // Note that this could be -1 if Inst is duplicated across multiple
        // partitions.
        int ThisPartition = this->InstToPartitionId[Inst];
        if (Partition == -2)
          Partition = ThisPartition;
        // -1 means belonging to multiple partitions.
        else if (Partition == -1)
          break;
        else if (Partition != (int)ThisPartition)
          Partition = -1;
      }
      assert(Partition != -2 && "Pointer not belonging to any partition");
    }

    return PtrToPartitions;
  }

  void print(raw_ostream &OS) const {
    unsigned Index = 0;
    for (const auto &P : PartitionContainer) {
      OS << "Partition " << Index++ << " (" << &P << "):\n";
      P.print();
    }
  }

  void dump() const { print(dbgs()); }

#ifndef NDEBUG
  friend raw_ostream &operator<<(raw_ostream &OS,
                                 const InstPartitionContainer &Partitions) {
    Partitions.print(OS);
    return OS;
  }
#endif

  void printBlocks() const {
    unsigned Index = 0;
    for (const auto &P : PartitionContainer) {
      dbgs() << "\nPartition " << Index++ << " (" << &P << "):\n";
      P.printBlocks();
    }
  }

private:
  using PartitionContainerT = std::list<InstPartition>;

  /// List of partitions.
  PartitionContainerT PartitionContainer;

  /// Mapping from Instruction to partition Id. If the instruction
  /// belongs to multiple partitions the entry contains -1.
  InstToPartitionIdT InstToPartitionId;

  Loop *L;
  LoopInfo *LI;
  DominatorTree *DT;

  /// The control structure to merge adjacent partitions if they both satisfy
  /// the \p Predicate.
  template <class UnaryPredicate>
  void mergeAdjacentPartitionsIf(UnaryPredicate Predicate) {
    InstPartition *PrevMatch = nullptr;
    for (auto I = PartitionContainer.begin(); I != PartitionContainer.end();) {
      auto DoesMatch = Predicate(&*I);
      if (PrevMatch == nullptr && DoesMatch) {
        PrevMatch = &*I;
        ++I;
      } else if (PrevMatch != nullptr && DoesMatch) {
        I->moveTo(*PrevMatch);
        I = PartitionContainer.erase(I);
      } else {
        PrevMatch = nullptr;
        ++I;
      }
    }
  }

  /// Assign new LoopIDs for the partition's cloned loop.
  void setNewLoopID(MDNode *OrigLoopID, InstPartition *Part) {
    std::optional<MDNode *> PartitionID = makeFollowupLoopID(
        OrigLoopID,
        {LLVMLoopDistributeFollowupAll,
         Part->hasDepCycle() ? LLVMLoopDistributeFollowupSequential
                             : LLVMLoopDistributeFollowupCoincident});
    if (PartitionID) {
      Loop *NewLoop = Part->getDistributedLoop();
      NewLoop->setLoopID(*PartitionID);
    }
  }
};

/// For each memory instruction, this class maintains the difference of the
/// number of unsafe dependences that start out from this instruction minus
/// those that end here.
///
/// By traversing the memory instructions in program order and accumulating
/// this number, we know whether any unsafe dependence crosses over a program
/// point.
class MemoryInstructionDependences {
  using Dependence = MemoryDepChecker::Dependence;

public:
  struct Entry {
    Instruction *Inst;
    unsigned NumUnsafeDependencesStartOrEnd = 0;

    Entry(Instruction *Inst) : Inst(Inst) {}
  };

  using AccessesType = SmallVector<Entry, 8>;

  AccessesType::const_iterator begin() const { return Accesses.begin(); }
  AccessesType::const_iterator end() const { return Accesses.end(); }

  MemoryInstructionDependences(
      const SmallVectorImpl<Instruction *> &Instructions,
      const SmallVectorImpl<Dependence> &Dependences) {
    Accesses.append(Instructions.begin(), Instructions.end());

    LLVM_DEBUG(dbgs() << "Backward dependences:\n");
    for (const auto &Dep : Dependences)
      if (Dep.isPossiblyBackward()) {
        // Note that the designations source and destination follow the program
        // order, i.e. source is always first. (The direction is given by the
        // DepType.)
        ++Accesses[Dep.Source].NumUnsafeDependencesStartOrEnd;
        --Accesses[Dep.Destination].NumUnsafeDependencesStartOrEnd;

        LLVM_DEBUG(Dep.print(dbgs(), 2, Instructions));
      }
  }

private:
  AccessesType Accesses;
};

/// The actual class performing the per-loop work.
class LoopDistributeForLoop {
public:
  LoopDistributeForLoop(Loop *L, Function *F, LoopInfo *LI, DominatorTree *DT,
                        ScalarEvolution *SE, LoopAccessInfoManager &LAIs,
                        OptimizationRemarkEmitter *ORE)
      : L(L), F(F), LI(LI), DT(DT), SE(SE), LAIs(LAIs), ORE(ORE) {
    setForced();
  }

  /// Try to distribute an inner-most loop.
  bool processLoop() {
    assert(L->isInnermost() && "Only process inner loops.");

    LLVM_DEBUG(dbgs() << "\nLDist: In \""
                      << L->getHeader()->getParent()->getName()
                      << "\" checking " << *L << "\n");

    // Having a single exit block implies there's also one exiting block.
    if (!L->getExitBlock())
      return fail("MultipleExitBlocks", "multiple exit blocks");
    if (!L->isLoopSimplifyForm())
      return fail("NotLoopSimplifyForm",
                  "loop is not in loop-simplify form");
    if (!L->isRotatedForm())
      return fail("NotBottomTested", "loop is not bottom tested");

    BasicBlock *PH = L->getLoopPreheader();

    LAI = &LAIs.getInfo(*L);

    // Currently, we only distribute to isolate the part of the loop with
    // dependence cycles to enable partial vectorization.
    if (LAI->canVectorizeMemory())
      return fail("MemOpsCanBeVectorized",
                  "memory operations are safe for vectorization");

    auto *Dependences = LAI->getDepChecker().getDependences();
    if (!Dependences || Dependences->empty())
      return fail("NoUnsafeDeps", "no unsafe dependences to isolate");

    InstPartitionContainer Partitions(L, LI, DT);

    // First, go through each memory operation and assign them to consecutive
    // partitions (the order of partitions follows program order). Put those
    // with unsafe dependences into a "cyclic" partition, otherwise put each
    // store in its own "non-cyclic" partition (we'll merge these later).
    //
    // Note that a memory operation (e.g. Load2 below) at a program point that
    // has an unsafe dependence (Store3->Load1) spanning over it must be
    // included in the same cyclic partition as the dependent operations. This
    // is to preserve the original program order after distribution. E.g.:
    //
    //           NumUnsafeDependencesStartOrEnd  NumUnsafeDependencesActive
    //  Load1   -.                 1                       0->1
    //  Load2    | /Unsafe/        0                       1
    //  Store3  -'                -1                       1->0
    //  Load4                      0                       0
    //
    // NumUnsafeDependencesActive > 0 indicates this situation and in this case
    // we just keep assigning to the same cyclic partition until
    // NumUnsafeDependencesActive reaches 0.
    const MemoryDepChecker &DepChecker = LAI->getDepChecker();
    MemoryInstructionDependences MID(DepChecker.getMemoryInstructions(),
                                     *Dependences);

    int NumUnsafeDependencesActive = 0;
    for (const auto &InstDep : MID) {
      Instruction *I = InstDep.Inst;
      // We update NumUnsafeDependencesActive post-instruction, catch the
      // start of a dependence directly via NumUnsafeDependencesStartOrEnd.
      if (NumUnsafeDependencesActive ||
          InstDep.NumUnsafeDependencesStartOrEnd > 0)
        Partitions.addToCyclicPartition(I);
      else
        Partitions.addToNewNonCyclicPartition(I);
      NumUnsafeDependencesActive += InstDep.NumUnsafeDependencesStartOrEnd;
      assert(NumUnsafeDependencesActive >= 0 &&
             "Negative number of dependences active");
    }

    // Add partitions for values used outside. These partitions can be out of
    // order from the original program order. This is OK because if the
    // partition uses a load we will merge this partition with the original
    // partition of the load that we set up in the previous loop (see
    // mergeToAvoidDuplicatedLoads).
    auto DefsUsedOutside = findDefsUsedOutsideOfLoop(L);
    for (auto *Inst : DefsUsedOutside)
      Partitions.addToNewNonCyclicPartition(Inst);

    LLVM_DEBUG(dbgs() << "Seeded partitions:\n" << Partitions);
    if (Partitions.getSize() < 2)
      return fail("CantIsolateUnsafeDeps",
                  "cannot isolate unsafe dependencies");

    // Run the merge heuristics: Merge non-cyclic adjacent partitions since we
    // should be able to vectorize these together.
    Partitions.mergeBeforePopulating();
    LLVM_DEBUG(dbgs() << "\nMerged partitions:\n" << Partitions);
    if (Partitions.getSize() < 2)
      return fail("CantIsolateUnsafeDeps",
                  "cannot isolate unsafe dependencies");

    // Now, populate the partitions with non-memory operations.
    Partitions.populateUsedSet();
    LLVM_DEBUG(dbgs() << "\nPopulated partitions:\n" << Partitions);

    // In order to preserve original lexical order for loads, keep them in the
    // partition that we set up in the MemoryInstructionDependences loop.
    if (Partitions.mergeToAvoidDuplicatedLoads()) {
      LLVM_DEBUG(dbgs() << "\nPartitions merged to ensure unique loads:\n"
                        << Partitions);
      if (Partitions.getSize() < 2)
        return fail("CantIsolateUnsafeDeps",
                    "cannot isolate unsafe dependencies");
    }

    // Don't distribute the loop if we need too many SCEV run-time checks, or
    // if it's illegal to insert them.
    const SCEVPredicate &Pred = LAI->getPSE().getPredicate();
    if (LAI->hasConvergentOp() && !Pred.isAlwaysTrue()) {
      return fail("RuntimeCheckWithConvergent",
                  "may not insert runtime check with convergent operation");
    }

    if (Pred.getComplexity() > (IsForced.value_or(false)
                                    ? PragmaDistributeSCEVCheckThreshold
                                    : DistributeSCEVCheckThreshold))
      return fail("TooManySCEVRuntimeChecks",
                  "too many SCEV run-time checks needed.\n");

    if (!IsForced.value_or(false) && hasDisableAllTransformsHint(L))
      return fail("HeuristicDisabled", "distribution heuristic disabled");

    LLVM_DEBUG(dbgs() << "\nDistributing loop: " << *L << "\n");
    // We're done forming the partitions; set up the reverse mapping from
    // instructions to partitions.
    Partitions.setupPartitionIdOnInstructions();

    // If we need run-time checks, version the loop now.
    auto PtrToPartition = Partitions.computePartitionSetForPointers(*LAI);
    const auto *RtPtrChecking = LAI->getRuntimePointerChecking();
    const auto &AllChecks = RtPtrChecking->getChecks();
    auto Checks = includeOnlyCrossPartitionChecks(AllChecks, PtrToPartition,
                                                  RtPtrChecking);

    if (LAI->hasConvergentOp() && !Checks.empty()) {
      return fail("RuntimeCheckWithConvergent",
                  "may not insert runtime check with convergent operation");
    }

    // To keep things simple have an empty preheader before we version or clone
    // the loop. (Also split if this has no predecessor, i.e. entry, because we
    // rely on PH having a predecessor.)
    if (!PH->getSinglePredecessor() || &*PH->begin() != PH->getTerminator())
      SplitBlock(PH, PH->getTerminator(), DT, LI);
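
    // Sketch of the structure LoopVersioning produces below (illustrative
    // block names): run-time checks select between the versioned loop, which
    // is then distributed, and the unmodified original as the fallback.
    //
    //   PH -> memchecks --pass--> versioned loop (distributed below)
    //                   \--fail--> fallback loop (kept intact)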

    if (!Pred.isAlwaysTrue() || !Checks.empty()) {
      assert(!LAI->hasConvergentOp() && "inserting illegal loop versioning");

      MDNode *OrigLoopID = L->getLoopID();

      LLVM_DEBUG(dbgs() << "\nPointers:\n");
      LLVM_DEBUG(LAI->getRuntimePointerChecking()->printChecks(dbgs(), Checks));
      LoopVersioning LVer(*LAI, Checks, L, LI, DT, SE);
      LVer.versionLoop(DefsUsedOutside);
      LVer.annotateLoopWithNoAlias();

      // The unversioned loop will not be changed, so we inherit all attributes
      // from the original loop, but remove the loop distribution metadata to
      // avoid distributing it again.
      MDNode *UnversionedLoopID = *makeFollowupLoopID(
          OrigLoopID,
          {LLVMLoopDistributeFollowupAll, LLVMLoopDistributeFollowupFallback},
          "llvm.loop.distribute.", true);
      LVer.getNonVersionedLoop()->setLoopID(UnversionedLoopID);
    }

    // Create identical copies of the original loop for each partition and hook
    // them up sequentially.
    Partitions.cloneLoops();

    // Now, we remove the instructions from each loop that don't belong to that
    // partition.
    Partitions.removeUnusedInsts();
    LLVM_DEBUG(dbgs() << "\nAfter removing unused Instrs:\n");
    LLVM_DEBUG(Partitions.printBlocks());

    if (LDistVerify) {
      LI->verify(*DT);
      assert(DT->verify(DominatorTree::VerificationLevel::Fast));
    }

    ++NumLoopsDistributed;
    // Report the success.
    ORE->emit([&]() {
      return OptimizationRemark(LDIST_NAME, "Distribute", L->getStartLoc(),
                                L->getHeader())
             << "distributed loop";
    });
    return true;
  }

  /// Provide diagnostics, then \return false.
  bool fail(StringRef RemarkName, StringRef Message) {
    LLVMContext &Ctx = F->getContext();
    bool Forced = isForced().value_or(false);

    LLVM_DEBUG(dbgs() << "Skipping; " << Message << "\n");

    // With -Rpass-missed, report that distribution failed.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LDIST_NAME, "NotDistributed",
                                      L->getStartLoc(), L->getHeader())
             << "loop not distributed: use -Rpass-analysis=loop-distribute for "
                "more info";
    });

    // With -Rpass-analysis, report why. This is on by default if distribution
    // was requested explicitly.
    ORE->emit(OptimizationRemarkAnalysis(
                  Forced ? OptimizationRemarkAnalysis::AlwaysPrint : LDIST_NAME,
                  RemarkName, L->getStartLoc(), L->getHeader())
              << "loop not distributed: " << Message);

    // Also issue a warning if distribution was requested explicitly but it
    // failed.
    if (Forced)
      Ctx.diagnose(DiagnosticInfoOptimizationFailure(
          *F, L->getStartLoc(), "loop not distributed: failed "
                                "explicitly specified loop distribution"));

    return false;
  }

  /// Return whether distribution was forced to be enabled/disabled for the
  /// loop.
  ///
  /// If the optional has a value, it indicates whether distribution was forced
  /// to be enabled (true) or disabled (false). If the optional has no value
  /// distribution was not forced either way.
  const std::optional<bool> &isForced() const { return IsForced; }

private:
  /// Filter out checks between pointers from the same partition.
  ///
  /// \p PtrToPartition contains the partition number for pointers. Partition
  /// number -1 means that the pointer is used in multiple partitions.
  /// In this case we can't safely omit the check.
  SmallVector<RuntimePointerCheck, 4> includeOnlyCrossPartitionChecks(
      const SmallVectorImpl<RuntimePointerCheck> &AllChecks,
      const SmallVectorImpl<int> &PtrToPartition,
      const RuntimePointerChecking *RtPtrChecking) {
    SmallVector<RuntimePointerCheck, 4> Checks;

    copy_if(AllChecks, std::back_inserter(Checks),
            [&](const RuntimePointerCheck &Check) {
              for (unsigned PtrIdx1 : Check.first->Members)
                for (unsigned PtrIdx2 : Check.second->Members)
                  // Only include this check if there is a pair of pointers
                  // that require checking and the pointers fall into
                  // separate partitions.
                  //
                  // (Note that we already know at this point that the two
                  // pointer groups need checking but it doesn't follow
                  // that each pair of pointers within the two groups needs
                  // checking as well.
                  //
                  // In other words we don't want to include a check just
                  // because there is a pair of pointers between the two
                  // pointer groups that require checks and a different
                  // pair whose pointers fall into different partitions.)
                  if (RtPtrChecking->needsChecking(PtrIdx1, PtrIdx2) &&
                      !RuntimePointerChecking::arePointersInSamePartition(
                          PtrToPartition, PtrIdx1, PtrIdx2))
                    return true;
              return false;
            });

    return Checks;
  }

  /// Check whether the loop metadata is forcing distribution to be
  /// enabled/disabled.
  void setForced() {
    std::optional<const MDOperand *> Value =
        findStringMetadataForLoop(L, "llvm.loop.distribute.enable");
    if (!Value)
      return;

    const MDOperand *Op = *Value;
    assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata");
    IsForced = mdconst::extract<ConstantInt>(*Op)->getZExtValue();
  }

  Loop *L;
  Function *F;

  // Analyses used.
  LoopInfo *LI;
  const LoopAccessInfo *LAI = nullptr;
  DominatorTree *DT;
  ScalarEvolution *SE;
  LoopAccessInfoManager &LAIs;
  OptimizationRemarkEmitter *ORE;

  /// Indicates whether distribution is forced to be enabled/disabled for
  /// the loop.
  ///
  /// If the optional has a value, it indicates whether distribution was forced
  /// to be enabled (true) or disabled (false). If the optional has no value
  /// distribution was not forced either way.
  std::optional<bool> IsForced;
};

} // end anonymous namespace

/// Shared implementation between new and old PMs.
static bool runImpl(Function &F, LoopInfo *LI, DominatorTree *DT,
                    ScalarEvolution *SE, OptimizationRemarkEmitter *ORE,
                    LoopAccessInfoManager &LAIs) {
  // Build up a worklist of inner-loops to vectorize. This is necessary as the
  // act of distributing a loop creates new loops and can invalidate iterators
  // across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *TopLevelLoop : *LI)
    for (Loop *L : depth_first(TopLevelLoop))
      // We only handle inner-most loops.
      if (L->isInnermost())
        Worklist.push_back(L);

  // Now walk the identified inner loops.
  bool Changed = false;
  for (Loop *L : Worklist) {
    LoopDistributeForLoop LDL(L, &F, LI, DT, SE, LAIs, ORE);

    // If distribution was forced for the specific loop to be
    // enabled/disabled, follow that. Otherwise use the global flag.
    if (LDL.isForced().value_or(EnableLoopDistribute))
      Changed |= LDL.processLoop();
  }

  return Changed;
}
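
// Invocation sketch (the flag and pragma spellings below are the ones handled
// in this file): with the new pass manager the pass can be exercised as
//
//   opt -passes=loop-distribute -enable-loop-distribute input.ll -S
//
// and per-loop via "#pragma clang loop distribute(enable)" in Clang, which
// emits the "llvm.loop.distribute.enable" metadata read by setForced() above.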

namespace {

/// The pass class.
class LoopDistributeLegacy : public FunctionPass {
public:
  static char ID;

  LoopDistributeLegacy() : FunctionPass(ID) {
    // The default is set by the caller.
    initializeLoopDistributeLegacyPass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override {
    if (skipFunction(F))
      return false;

    auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
    auto &LAIs = getAnalysis<LoopAccessLegacyAnalysis>().getLAIs();

    return runImpl(F, LI, DT, SE, ORE, LAIs);
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();
    AU.addPreserved<LoopInfoWrapperPass>();
    AU.addRequired<LoopAccessLegacyAnalysis>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
    AU.addPreserved<GlobalsAAWrapperPass>();
  }
};

} // end anonymous namespace

PreservedAnalyses LoopDistributePass::run(Function &F,
                                          FunctionAnalysisManager &AM) {
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);

  LoopAccessInfoManager &LAIs = AM.getResult<LoopAccessAnalysis>(F);
  bool Changed = runImpl(F, &LI, &DT, &SE, &ORE, LAIs);
  if (!Changed)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;
  PA.preserve<LoopAnalysis>();
  PA.preserve<DominatorTreeAnalysis>();
  return PA;
}

char LoopDistributeLegacy::ID;

static const char ldist_name[] = "Loop Distribution";

INITIALIZE_PASS_BEGIN(LoopDistributeLegacy, LDIST_NAME, ldist_name, false,
                      false)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_END(LoopDistributeLegacy, LDIST_NAME, ldist_name, false, false)

FunctionPass *llvm::createLoopDistributePass() {
  return new LoopDistributeLegacy();
}