1 //===- LoopDistribute.cpp - Loop Distribution Pass ------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements the Loop Distribution Pass. Its main focus is to 10 // distribute loops that cannot be vectorized due to dependence cycles. It 11 // tries to isolate the offending dependences into a new loop allowing 12 // vectorization of the remaining parts. 13 // 14 // For dependence analysis, the pass uses the LoopVectorizer's 15 // LoopAccessAnalysis. Because this analysis presumes no change in the order of 16 // memory operations, special care is taken to preserve the lexical order of 17 // these operations. 18 // 19 // Similarly to the Vectorizer, the pass also supports loop versioning to 20 // run-time disambiguate potentially overlapping arrays. 21 // 22 //===----------------------------------------------------------------------===// 23 24 #include "llvm/Transforms/Scalar/LoopDistribute.h" 25 #include "llvm/ADT/DenseMap.h" 26 #include "llvm/ADT/DepthFirstIterator.h" 27 #include "llvm/ADT/EquivalenceClasses.h" 28 #include "llvm/ADT/Optional.h" 29 #include "llvm/ADT/STLExtras.h" 30 #include "llvm/ADT/SmallPtrSet.h" 31 #include "llvm/ADT/SmallVector.h" 32 #include "llvm/ADT/Statistic.h" 33 #include "llvm/ADT/StringRef.h" 34 #include "llvm/ADT/Twine.h" 35 #include "llvm/ADT/iterator_range.h" 36 #include "llvm/Analysis/AliasAnalysis.h" 37 #include "llvm/Analysis/AssumptionCache.h" 38 #include "llvm/Analysis/GlobalsModRef.h" 39 #include "llvm/Analysis/LoopAccessAnalysis.h" 40 #include "llvm/Analysis/LoopAnalysisManager.h" 41 #include "llvm/Analysis/LoopInfo.h" 42 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 43 #include "llvm/Analysis/ScalarEvolution.h" 44 #include "llvm/Analysis/TargetLibraryInfo.h" 45 #include "llvm/Analysis/TargetTransformInfo.h" 46 #include "llvm/IR/BasicBlock.h" 47 #include "llvm/IR/Constants.h" 48 #include "llvm/IR/DiagnosticInfo.h" 49 #include "llvm/IR/Dominators.h" 50 #include "llvm/IR/Function.h" 51 #include "llvm/IR/InstrTypes.h" 52 #include "llvm/IR/Instruction.h" 53 #include "llvm/IR/Instructions.h" 54 #include "llvm/IR/LLVMContext.h" 55 #include "llvm/IR/Metadata.h" 56 #include "llvm/IR/PassManager.h" 57 #include "llvm/IR/Value.h" 58 #include "llvm/InitializePasses.h" 59 #include "llvm/Pass.h" 60 #include "llvm/Support/Casting.h" 61 #include "llvm/Support/CommandLine.h" 62 #include "llvm/Support/Debug.h" 63 #include "llvm/Support/raw_ostream.h" 64 #include "llvm/Transforms/Scalar.h" 65 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 66 #include "llvm/Transforms/Utils/Cloning.h" 67 #include "llvm/Transforms/Utils/LoopUtils.h" 68 #include "llvm/Transforms/Utils/LoopVersioning.h" 69 #include "llvm/Transforms/Utils/ValueMapper.h" 70 #include <cassert> 71 #include <functional> 72 #include <list> 73 #include <tuple> 74 #include <utility> 75 76 using namespace llvm; 77 78 #define LDIST_NAME "loop-distribute" 79 #define DEBUG_TYPE LDIST_NAME 80 81 /// @{ 82 /// Metadata attribute names 83 static const char *const LLVMLoopDistributeFollowupAll = 84 "llvm.loop.distribute.followup_all"; 85 static const char *const LLVMLoopDistributeFollowupCoincident = 86 "llvm.loop.distribute.followup_coincident"; 87 static const char *const LLVMLoopDistributeFollowupSequential = 88 "llvm.loop.distribute.followup_sequential"; 89 static const char *const LLVMLoopDistributeFollowupFallback = 90 "llvm.loop.distribute.followup_fallback"; 91 /// @} 92 93 static cl::opt<bool> 94 LDistVerify("loop-distribute-verify", cl::Hidden, 95 cl::desc("Turn on DominatorTree and LoopInfo verification " 96 "after Loop Distribution"), 97 cl::init(false)); 98 99 static cl::opt<bool> DistributeNonIfConvertible( 100 "loop-distribute-non-if-convertible", cl::Hidden, 101 cl::desc("Whether to distribute into a loop that may not be " 102 "if-convertible by the loop vectorizer"), 103 cl::init(false)); 104 105 static cl::opt<unsigned> DistributeSCEVCheckThreshold( 106 "loop-distribute-scev-check-threshold", cl::init(8), cl::Hidden, 107 cl::desc("The maximum number of SCEV checks allowed for Loop " 108 "Distribution")); 109 110 static cl::opt<unsigned> PragmaDistributeSCEVCheckThreshold( 111 "loop-distribute-scev-check-threshold-with-pragma", cl::init(128), 112 cl::Hidden, 113 cl::desc( 114 "The maximum number of SCEV checks allowed for Loop " 115 "Distribution for loop marked with #pragma loop distribute(enable)")); 116 117 static cl::opt<bool> EnableLoopDistribute( 118 "enable-loop-distribute", cl::Hidden, 119 cl::desc("Enable the new, experimental LoopDistribution Pass"), 120 cl::init(false)); 121 122 STATISTIC(NumLoopsDistributed, "Number of loops distributed"); 123 124 namespace { 125 126 /// Maintains the set of instructions of the loop for a partition before 127 /// cloning. After cloning, it hosts the new loop. 128 class InstPartition { 129 using InstructionSet = SmallPtrSet<Instruction *, 8>; 130 131 public: 132 InstPartition(Instruction *I, Loop *L, bool DepCycle = false) 133 : DepCycle(DepCycle), OrigLoop(L) { 134 Set.insert(I); 135 } 136 137 /// Returns whether this partition contains a dependence cycle. 138 bool hasDepCycle() const { return DepCycle; } 139 140 /// Adds an instruction to this partition. 141 void add(Instruction *I) { Set.insert(I); } 142 143 /// Collection accessors. 144 InstructionSet::iterator begin() { return Set.begin(); } 145 InstructionSet::iterator end() { return Set.end(); } 146 InstructionSet::const_iterator begin() const { return Set.begin(); } 147 InstructionSet::const_iterator end() const { return Set.end(); } 148 bool empty() const { return Set.empty(); } 149 150 /// Moves this partition into \p Other. This partition becomes empty 151 /// after this. 152 void moveTo(InstPartition &Other) { 153 Other.Set.insert(Set.begin(), Set.end()); 154 Set.clear(); 155 Other.DepCycle |= DepCycle; 156 } 157 158 /// Populates the partition with a transitive closure of all the 159 /// instructions that the seeded instructions dependent on. 160 void populateUsedSet() { 161 // FIXME: We currently don't use control-dependence but simply include all 162 // blocks (possibly empty at the end) and let simplifycfg mostly clean this 163 // up. 164 for (auto *B : OrigLoop->getBlocks()) 165 Set.insert(B->getTerminator()); 166 167 // Follow the use-def chains to form a transitive closure of all the 168 // instructions that the originally seeded instructions depend on. 169 SmallVector<Instruction *, 8> Worklist(Set.begin(), Set.end()); 170 while (!Worklist.empty()) { 171 Instruction *I = Worklist.pop_back_val(); 172 // Insert instructions from the loop that we depend on. 173 for (Value *V : I->operand_values()) { 174 auto *I = dyn_cast<Instruction>(V); 175 if (I && OrigLoop->contains(I->getParent()) && Set.insert(I).second) 176 Worklist.push_back(I); 177 } 178 } 179 } 180 181 /// Clones the original loop. 182 /// 183 /// Updates LoopInfo and DominatorTree using the information that block \p 184 /// LoopDomBB dominates the loop. 185 Loop *cloneLoopWithPreheader(BasicBlock *InsertBefore, BasicBlock *LoopDomBB, 186 unsigned Index, LoopInfo *LI, 187 DominatorTree *DT) { 188 ClonedLoop = ::cloneLoopWithPreheader(InsertBefore, LoopDomBB, OrigLoop, 189 VMap, Twine(".ldist") + Twine(Index), 190 LI, DT, ClonedLoopBlocks); 191 return ClonedLoop; 192 } 193 194 /// The cloned loop. If this partition is mapped to the original loop, 195 /// this is null. 196 const Loop *getClonedLoop() const { return ClonedLoop; } 197 198 /// Returns the loop where this partition ends up after distribution. 199 /// If this partition is mapped to the original loop then use the block from 200 /// the loop. 201 Loop *getDistributedLoop() const { 202 return ClonedLoop ? ClonedLoop : OrigLoop; 203 } 204 205 /// The VMap that is populated by cloning and then used in 206 /// remapinstruction to remap the cloned instructions. 207 ValueToValueMapTy &getVMap() { return VMap; } 208 209 /// Remaps the cloned instructions using VMap. 210 void remapInstructions() { 211 remapInstructionsInBlocks(ClonedLoopBlocks, VMap); 212 } 213 214 /// Based on the set of instructions selected for this partition, 215 /// removes the unnecessary ones. 216 void removeUnusedInsts() { 217 SmallVector<Instruction *, 8> Unused; 218 219 for (auto *Block : OrigLoop->getBlocks()) 220 for (auto &Inst : *Block) 221 if (!Set.count(&Inst)) { 222 Instruction *NewInst = &Inst; 223 if (!VMap.empty()) 224 NewInst = cast<Instruction>(VMap[NewInst]); 225 226 assert(!isa<BranchInst>(NewInst) && 227 "Branches are marked used early on"); 228 Unused.push_back(NewInst); 229 } 230 231 // Delete the instructions backwards, as it has a reduced likelihood of 232 // having to update as many def-use and use-def chains. 233 for (auto *Inst : reverse(Unused)) { 234 if (!Inst->use_empty()) 235 Inst->replaceAllUsesWith(UndefValue::get(Inst->getType())); 236 Inst->eraseFromParent(); 237 } 238 } 239 240 void print() const { 241 if (DepCycle) 242 dbgs() << " (cycle)\n"; 243 for (auto *I : Set) 244 // Prefix with the block name. 245 dbgs() << " " << I->getParent()->getName() << ":" << *I << "\n"; 246 } 247 248 void printBlocks() const { 249 for (auto *BB : getDistributedLoop()->getBlocks()) 250 dbgs() << *BB; 251 } 252 253 private: 254 /// Instructions from OrigLoop selected for this partition. 255 InstructionSet Set; 256 257 /// Whether this partition contains a dependence cycle. 258 bool DepCycle; 259 260 /// The original loop. 261 Loop *OrigLoop; 262 263 /// The cloned loop. If this partition is mapped to the original loop, 264 /// this is null. 265 Loop *ClonedLoop = nullptr; 266 267 /// The blocks of ClonedLoop including the preheader. If this 268 /// partition is mapped to the original loop, this is empty. 269 SmallVector<BasicBlock *, 8> ClonedLoopBlocks; 270 271 /// These gets populated once the set of instructions have been 272 /// finalized. If this partition is mapped to the original loop, these are not 273 /// set. 274 ValueToValueMapTy VMap; 275 }; 276 277 /// Holds the set of Partitions. It populates them, merges them and then 278 /// clones the loops. 279 class InstPartitionContainer { 280 using InstToPartitionIdT = DenseMap<Instruction *, int>; 281 282 public: 283 InstPartitionContainer(Loop *L, LoopInfo *LI, DominatorTree *DT) 284 : L(L), LI(LI), DT(DT) {} 285 286 /// Returns the number of partitions. 287 unsigned getSize() const { return PartitionContainer.size(); } 288 289 /// Adds \p Inst into the current partition if that is marked to 290 /// contain cycles. Otherwise start a new partition for it. 291 void addToCyclicPartition(Instruction *Inst) { 292 // If the current partition is non-cyclic. Start a new one. 293 if (PartitionContainer.empty() || !PartitionContainer.back().hasDepCycle()) 294 PartitionContainer.emplace_back(Inst, L, /*DepCycle=*/true); 295 else 296 PartitionContainer.back().add(Inst); 297 } 298 299 /// Adds \p Inst into a partition that is not marked to contain 300 /// dependence cycles. 301 /// 302 // Initially we isolate memory instructions into as many partitions as 303 // possible, then later we may merge them back together. 304 void addToNewNonCyclicPartition(Instruction *Inst) { 305 PartitionContainer.emplace_back(Inst, L); 306 } 307 308 /// Merges adjacent non-cyclic partitions. 309 /// 310 /// The idea is that we currently only want to isolate the non-vectorizable 311 /// partition. We could later allow more distribution among these partition 312 /// too. 313 void mergeAdjacentNonCyclic() { 314 mergeAdjacentPartitionsIf( 315 [](const InstPartition *P) { return !P->hasDepCycle(); }); 316 } 317 318 /// If a partition contains only conditional stores, we won't vectorize 319 /// it. Try to merge it with a previous cyclic partition. 320 void mergeNonIfConvertible() { 321 mergeAdjacentPartitionsIf([&](const InstPartition *Partition) { 322 if (Partition->hasDepCycle()) 323 return true; 324 325 // Now, check if all stores are conditional in this partition. 326 bool seenStore = false; 327 328 for (auto *Inst : *Partition) 329 if (isa<StoreInst>(Inst)) { 330 seenStore = true; 331 if (!LoopAccessInfo::blockNeedsPredication(Inst->getParent(), L, DT)) 332 return false; 333 } 334 return seenStore; 335 }); 336 } 337 338 /// Merges the partitions according to various heuristics. 339 void mergeBeforePopulating() { 340 mergeAdjacentNonCyclic(); 341 if (!DistributeNonIfConvertible) 342 mergeNonIfConvertible(); 343 } 344 345 /// Merges partitions in order to ensure that no loads are duplicated. 346 /// 347 /// We can't duplicate loads because that could potentially reorder them. 348 /// LoopAccessAnalysis provides dependency information with the context that 349 /// the order of memory operation is preserved. 350 /// 351 /// Return if any partitions were merged. 352 bool mergeToAvoidDuplicatedLoads() { 353 using LoadToPartitionT = DenseMap<Instruction *, InstPartition *>; 354 using ToBeMergedT = EquivalenceClasses<InstPartition *>; 355 356 LoadToPartitionT LoadToPartition; 357 ToBeMergedT ToBeMerged; 358 359 // Step through the partitions and create equivalence between partitions 360 // that contain the same load. Also put partitions in between them in the 361 // same equivalence class to avoid reordering of memory operations. 362 for (PartitionContainerT::iterator I = PartitionContainer.begin(), 363 E = PartitionContainer.end(); 364 I != E; ++I) { 365 auto *PartI = &*I; 366 367 // If a load occurs in two partitions PartI and PartJ, merge all 368 // partitions (PartI, PartJ] into PartI. 369 for (Instruction *Inst : *PartI) 370 if (isa<LoadInst>(Inst)) { 371 bool NewElt; 372 LoadToPartitionT::iterator LoadToPart; 373 374 std::tie(LoadToPart, NewElt) = 375 LoadToPartition.insert(std::make_pair(Inst, PartI)); 376 if (!NewElt) { 377 LLVM_DEBUG(dbgs() 378 << "Merging partitions due to this load in multiple " 379 << "partitions: " << PartI << ", " << LoadToPart->second 380 << "\n" 381 << *Inst << "\n"); 382 383 auto PartJ = I; 384 do { 385 --PartJ; 386 ToBeMerged.unionSets(PartI, &*PartJ); 387 } while (&*PartJ != LoadToPart->second); 388 } 389 } 390 } 391 if (ToBeMerged.empty()) 392 return false; 393 394 // Merge the member of an equivalence class into its class leader. This 395 // makes the members empty. 396 for (ToBeMergedT::iterator I = ToBeMerged.begin(), E = ToBeMerged.end(); 397 I != E; ++I) { 398 if (!I->isLeader()) 399 continue; 400 401 auto PartI = I->getData(); 402 for (auto PartJ : make_range(std::next(ToBeMerged.member_begin(I)), 403 ToBeMerged.member_end())) { 404 PartJ->moveTo(*PartI); 405 } 406 } 407 408 // Remove the empty partitions. 409 PartitionContainer.remove_if( 410 [](const InstPartition &P) { return P.empty(); }); 411 412 return true; 413 } 414 415 /// Sets up the mapping between instructions to partitions. If the 416 /// instruction is duplicated across multiple partitions, set the entry to -1. 417 void setupPartitionIdOnInstructions() { 418 int PartitionID = 0; 419 for (const auto &Partition : PartitionContainer) { 420 for (Instruction *Inst : Partition) { 421 bool NewElt; 422 InstToPartitionIdT::iterator Iter; 423 424 std::tie(Iter, NewElt) = 425 InstToPartitionId.insert(std::make_pair(Inst, PartitionID)); 426 if (!NewElt) 427 Iter->second = -1; 428 } 429 ++PartitionID; 430 } 431 } 432 433 /// Populates the partition with everything that the seeding 434 /// instructions require. 435 void populateUsedSet() { 436 for (auto &P : PartitionContainer) 437 P.populateUsedSet(); 438 } 439 440 /// This performs the main chunk of the work of cloning the loops for 441 /// the partitions. 442 void cloneLoops() { 443 BasicBlock *OrigPH = L->getLoopPreheader(); 444 // At this point the predecessor of the preheader is either the memcheck 445 // block or the top part of the original preheader. 446 BasicBlock *Pred = OrigPH->getSinglePredecessor(); 447 assert(Pred && "Preheader does not have a single predecessor"); 448 BasicBlock *ExitBlock = L->getExitBlock(); 449 assert(ExitBlock && "No single exit block"); 450 Loop *NewLoop; 451 452 assert(!PartitionContainer.empty() && "at least two partitions expected"); 453 // We're cloning the preheader along with the loop so we already made sure 454 // it was empty. 455 assert(&*OrigPH->begin() == OrigPH->getTerminator() && 456 "preheader not empty"); 457 458 // Preserve the original loop ID for use after the transformation. 459 MDNode *OrigLoopID = L->getLoopID(); 460 461 // Create a loop for each partition except the last. Clone the original 462 // loop before PH along with adding a preheader for the cloned loop. Then 463 // update PH to point to the newly added preheader. 464 BasicBlock *TopPH = OrigPH; 465 unsigned Index = getSize() - 1; 466 for (auto I = std::next(PartitionContainer.rbegin()), 467 E = PartitionContainer.rend(); 468 I != E; ++I, --Index, TopPH = NewLoop->getLoopPreheader()) { 469 auto *Part = &*I; 470 471 NewLoop = Part->cloneLoopWithPreheader(TopPH, Pred, Index, LI, DT); 472 473 Part->getVMap()[ExitBlock] = TopPH; 474 Part->remapInstructions(); 475 setNewLoopID(OrigLoopID, Part); 476 } 477 Pred->getTerminator()->replaceUsesOfWith(OrigPH, TopPH); 478 479 // Also set a new loop ID for the last loop. 480 setNewLoopID(OrigLoopID, &PartitionContainer.back()); 481 482 // Now go in forward order and update the immediate dominator for the 483 // preheaders with the exiting block of the previous loop. Dominance 484 // within the loop is updated in cloneLoopWithPreheader. 485 for (auto Curr = PartitionContainer.cbegin(), 486 Next = std::next(PartitionContainer.cbegin()), 487 E = PartitionContainer.cend(); 488 Next != E; ++Curr, ++Next) 489 DT->changeImmediateDominator( 490 Next->getDistributedLoop()->getLoopPreheader(), 491 Curr->getDistributedLoop()->getExitingBlock()); 492 } 493 494 /// Removes the dead instructions from the cloned loops. 495 void removeUnusedInsts() { 496 for (auto &Partition : PartitionContainer) 497 Partition.removeUnusedInsts(); 498 } 499 500 /// For each memory pointer, it computes the partitionId the pointer is 501 /// used in. 502 /// 503 /// This returns an array of int where the I-th entry corresponds to I-th 504 /// entry in LAI.getRuntimePointerCheck(). If the pointer is used in multiple 505 /// partitions its entry is set to -1. 506 SmallVector<int, 8> 507 computePartitionSetForPointers(const LoopAccessInfo &LAI) { 508 const RuntimePointerChecking *RtPtrCheck = LAI.getRuntimePointerChecking(); 509 510 unsigned N = RtPtrCheck->Pointers.size(); 511 SmallVector<int, 8> PtrToPartitions(N); 512 for (unsigned I = 0; I < N; ++I) { 513 Value *Ptr = RtPtrCheck->Pointers[I].PointerValue; 514 auto Instructions = 515 LAI.getInstructionsForAccess(Ptr, RtPtrCheck->Pointers[I].IsWritePtr); 516 517 int &Partition = PtrToPartitions[I]; 518 // First set it to uninitialized. 519 Partition = -2; 520 for (Instruction *Inst : Instructions) { 521 // Note that this could be -1 if Inst is duplicated across multiple 522 // partitions. 523 int ThisPartition = this->InstToPartitionId[Inst]; 524 if (Partition == -2) 525 Partition = ThisPartition; 526 // -1 means belonging to multiple partitions. 527 else if (Partition == -1) 528 break; 529 else if (Partition != (int)ThisPartition) 530 Partition = -1; 531 } 532 assert(Partition != -2 && "Pointer not belonging to any partition"); 533 } 534 535 return PtrToPartitions; 536 } 537 538 void print(raw_ostream &OS) const { 539 unsigned Index = 0; 540 for (const auto &P : PartitionContainer) { 541 OS << "Partition " << Index++ << " (" << &P << "):\n"; 542 P.print(); 543 } 544 } 545 546 void dump() const { print(dbgs()); } 547 548 #ifndef NDEBUG 549 friend raw_ostream &operator<<(raw_ostream &OS, 550 const InstPartitionContainer &Partitions) { 551 Partitions.print(OS); 552 return OS; 553 } 554 #endif 555 556 void printBlocks() const { 557 unsigned Index = 0; 558 for (const auto &P : PartitionContainer) { 559 dbgs() << "\nPartition " << Index++ << " (" << &P << "):\n"; 560 P.printBlocks(); 561 } 562 } 563 564 private: 565 using PartitionContainerT = std::list<InstPartition>; 566 567 /// List of partitions. 568 PartitionContainerT PartitionContainer; 569 570 /// Mapping from Instruction to partition Id. If the instruction 571 /// belongs to multiple partitions the entry contains -1. 572 InstToPartitionIdT InstToPartitionId; 573 574 Loop *L; 575 LoopInfo *LI; 576 DominatorTree *DT; 577 578 /// The control structure to merge adjacent partitions if both satisfy 579 /// the \p Predicate. 580 template <class UnaryPredicate> 581 void mergeAdjacentPartitionsIf(UnaryPredicate Predicate) { 582 InstPartition *PrevMatch = nullptr; 583 for (auto I = PartitionContainer.begin(); I != PartitionContainer.end();) { 584 auto DoesMatch = Predicate(&*I); 585 if (PrevMatch == nullptr && DoesMatch) { 586 PrevMatch = &*I; 587 ++I; 588 } else if (PrevMatch != nullptr && DoesMatch) { 589 I->moveTo(*PrevMatch); 590 I = PartitionContainer.erase(I); 591 } else { 592 PrevMatch = nullptr; 593 ++I; 594 } 595 } 596 } 597 598 /// Assign new LoopIDs for the partition's cloned loop. 599 void setNewLoopID(MDNode *OrigLoopID, InstPartition *Part) { 600 Optional<MDNode *> PartitionID = makeFollowupLoopID( 601 OrigLoopID, 602 {LLVMLoopDistributeFollowupAll, 603 Part->hasDepCycle() ? LLVMLoopDistributeFollowupSequential 604 : LLVMLoopDistributeFollowupCoincident}); 605 if (PartitionID.hasValue()) { 606 Loop *NewLoop = Part->getDistributedLoop(); 607 NewLoop->setLoopID(PartitionID.getValue()); 608 } 609 } 610 }; 611 612 /// For each memory instruction, this class maintains difference of the 613 /// number of unsafe dependences that start out from this instruction minus 614 /// those that end here. 615 /// 616 /// By traversing the memory instructions in program order and accumulating this 617 /// number, we know whether any unsafe dependence crosses over a program point. 618 class MemoryInstructionDependences { 619 using Dependence = MemoryDepChecker::Dependence; 620 621 public: 622 struct Entry { 623 Instruction *Inst; 624 unsigned NumUnsafeDependencesStartOrEnd = 0; 625 626 Entry(Instruction *Inst) : Inst(Inst) {} 627 }; 628 629 using AccessesType = SmallVector<Entry, 8>; 630 631 AccessesType::const_iterator begin() const { return Accesses.begin(); } 632 AccessesType::const_iterator end() const { return Accesses.end(); } 633 634 MemoryInstructionDependences( 635 const SmallVectorImpl<Instruction *> &Instructions, 636 const SmallVectorImpl<Dependence> &Dependences) { 637 Accesses.append(Instructions.begin(), Instructions.end()); 638 639 LLVM_DEBUG(dbgs() << "Backward dependences:\n"); 640 for (auto &Dep : Dependences) 641 if (Dep.isPossiblyBackward()) { 642 // Note that the designations source and destination follow the program 643 // order, i.e. source is always first. (The direction is given by the 644 // DepType.) 645 ++Accesses[Dep.Source].NumUnsafeDependencesStartOrEnd; 646 --Accesses[Dep.Destination].NumUnsafeDependencesStartOrEnd; 647 648 LLVM_DEBUG(Dep.print(dbgs(), 2, Instructions)); 649 } 650 } 651 652 private: 653 AccessesType Accesses; 654 }; 655 656 /// The actual class performing the per-loop work. 657 class LoopDistributeForLoop { 658 public: 659 LoopDistributeForLoop(Loop *L, Function *F, LoopInfo *LI, DominatorTree *DT, 660 ScalarEvolution *SE, OptimizationRemarkEmitter *ORE) 661 : L(L), F(F), LI(LI), DT(DT), SE(SE), ORE(ORE) { 662 setForced(); 663 } 664 665 /// Try to distribute an inner-most loop. 666 bool processLoop(std::function<const LoopAccessInfo &(Loop &)> &GetLAA) { 667 assert(L->empty() && "Only process inner loops."); 668 669 LLVM_DEBUG(dbgs() << "\nLDist: In \"" 670 << L->getHeader()->getParent()->getName() 671 << "\" checking " << *L << "\n"); 672 673 if (!L->getExitBlock()) 674 return fail("MultipleExitBlocks", "multiple exit blocks"); 675 if (!L->isLoopSimplifyForm()) 676 return fail("NotLoopSimplifyForm", 677 "loop is not in loop-simplify form"); 678 679 BasicBlock *PH = L->getLoopPreheader(); 680 681 // LAA will check that we only have a single exiting block. 682 LAI = &GetLAA(*L); 683 684 // Currently, we only distribute to isolate the part of the loop with 685 // dependence cycles to enable partial vectorization. 686 if (LAI->canVectorizeMemory()) 687 return fail("MemOpsCanBeVectorized", 688 "memory operations are safe for vectorization"); 689 690 auto *Dependences = LAI->getDepChecker().getDependences(); 691 if (!Dependences || Dependences->empty()) 692 return fail("NoUnsafeDeps", "no unsafe dependences to isolate"); 693 694 InstPartitionContainer Partitions(L, LI, DT); 695 696 // First, go through each memory operation and assign them to consecutive 697 // partitions (the order of partitions follows program order). Put those 698 // with unsafe dependences into "cyclic" partition otherwise put each store 699 // in its own "non-cyclic" partition (we'll merge these later). 700 // 701 // Note that a memory operation (e.g. Load2 below) at a program point that 702 // has an unsafe dependence (Store3->Load1) spanning over it must be 703 // included in the same cyclic partition as the dependent operations. This 704 // is to preserve the original program order after distribution. E.g.: 705 // 706 // NumUnsafeDependencesStartOrEnd NumUnsafeDependencesActive 707 // Load1 -. 1 0->1 708 // Load2 | /Unsafe/ 0 1 709 // Store3 -' -1 1->0 710 // Load4 0 0 711 // 712 // NumUnsafeDependencesActive > 0 indicates this situation and in this case 713 // we just keep assigning to the same cyclic partition until 714 // NumUnsafeDependencesActive reaches 0. 715 const MemoryDepChecker &DepChecker = LAI->getDepChecker(); 716 MemoryInstructionDependences MID(DepChecker.getMemoryInstructions(), 717 *Dependences); 718 719 int NumUnsafeDependencesActive = 0; 720 for (auto &InstDep : MID) { 721 Instruction *I = InstDep.Inst; 722 // We update NumUnsafeDependencesActive post-instruction, catch the 723 // start of a dependence directly via NumUnsafeDependencesStartOrEnd. 724 if (NumUnsafeDependencesActive || 725 InstDep.NumUnsafeDependencesStartOrEnd > 0) 726 Partitions.addToCyclicPartition(I); 727 else 728 Partitions.addToNewNonCyclicPartition(I); 729 NumUnsafeDependencesActive += InstDep.NumUnsafeDependencesStartOrEnd; 730 assert(NumUnsafeDependencesActive >= 0 && 731 "Negative number of dependences active"); 732 } 733 734 // Add partitions for values used outside. These partitions can be out of 735 // order from the original program order. This is OK because if the 736 // partition uses a load we will merge this partition with the original 737 // partition of the load that we set up in the previous loop (see 738 // mergeToAvoidDuplicatedLoads). 739 auto DefsUsedOutside = findDefsUsedOutsideOfLoop(L); 740 for (auto *Inst : DefsUsedOutside) 741 Partitions.addToNewNonCyclicPartition(Inst); 742 743 LLVM_DEBUG(dbgs() << "Seeded partitions:\n" << Partitions); 744 if (Partitions.getSize() < 2) 745 return fail("CantIsolateUnsafeDeps", 746 "cannot isolate unsafe dependencies"); 747 748 // Run the merge heuristics: Merge non-cyclic adjacent partitions since we 749 // should be able to vectorize these together. 750 Partitions.mergeBeforePopulating(); 751 LLVM_DEBUG(dbgs() << "\nMerged partitions:\n" << Partitions); 752 if (Partitions.getSize() < 2) 753 return fail("CantIsolateUnsafeDeps", 754 "cannot isolate unsafe dependencies"); 755 756 // Now, populate the partitions with non-memory operations. 757 Partitions.populateUsedSet(); 758 LLVM_DEBUG(dbgs() << "\nPopulated partitions:\n" << Partitions); 759 760 // In order to preserve original lexical order for loads, keep them in the 761 // partition that we set up in the MemoryInstructionDependences loop. 762 if (Partitions.mergeToAvoidDuplicatedLoads()) { 763 LLVM_DEBUG(dbgs() << "\nPartitions merged to ensure unique loads:\n" 764 << Partitions); 765 if (Partitions.getSize() < 2) 766 return fail("CantIsolateUnsafeDeps", 767 "cannot isolate unsafe dependencies"); 768 } 769 770 // Don't distribute the loop if we need too many SCEV run-time checks, or 771 // any if it's illegal. 772 const SCEVUnionPredicate &Pred = LAI->getPSE().getUnionPredicate(); 773 if (LAI->hasConvergentOp() && !Pred.isAlwaysTrue()) { 774 return fail("RuntimeCheckWithConvergent", 775 "may not insert runtime check with convergent operation"); 776 } 777 778 if (Pred.getComplexity() > (IsForced.getValueOr(false) 779 ? PragmaDistributeSCEVCheckThreshold 780 : DistributeSCEVCheckThreshold)) 781 return fail("TooManySCEVRuntimeChecks", 782 "too many SCEV run-time checks needed.\n"); 783 784 if (!IsForced.getValueOr(false) && hasDisableAllTransformsHint(L)) 785 return fail("HeuristicDisabled", "distribution heuristic disabled"); 786 787 LLVM_DEBUG(dbgs() << "\nDistributing loop: " << *L << "\n"); 788 // We're done forming the partitions set up the reverse mapping from 789 // instructions to partitions. 790 Partitions.setupPartitionIdOnInstructions(); 791 792 // To keep things simple have an empty preheader before we version or clone 793 // the loop. (Also split if this has no predecessor, i.e. entry, because we 794 // rely on PH having a predecessor.) 795 if (!PH->getSinglePredecessor() || &*PH->begin() != PH->getTerminator()) 796 SplitBlock(PH, PH->getTerminator(), DT, LI); 797 798 // If we need run-time checks, version the loop now. 799 auto PtrToPartition = Partitions.computePartitionSetForPointers(*LAI); 800 const auto *RtPtrChecking = LAI->getRuntimePointerChecking(); 801 const auto &AllChecks = RtPtrChecking->getChecks(); 802 auto Checks = includeOnlyCrossPartitionChecks(AllChecks, PtrToPartition, 803 RtPtrChecking); 804 805 if (LAI->hasConvergentOp() && !Checks.empty()) { 806 return fail("RuntimeCheckWithConvergent", 807 "may not insert runtime check with convergent operation"); 808 } 809 810 if (!Pred.isAlwaysTrue() || !Checks.empty()) { 811 assert(!LAI->hasConvergentOp() && "inserting illegal loop versioning"); 812 813 MDNode *OrigLoopID = L->getLoopID(); 814 815 LLVM_DEBUG(dbgs() << "\nPointers:\n"); 816 LLVM_DEBUG(LAI->getRuntimePointerChecking()->printChecks(dbgs(), Checks)); 817 LoopVersioning LVer(*LAI, L, LI, DT, SE, false); 818 LVer.setAliasChecks(std::move(Checks)); 819 LVer.setSCEVChecks(LAI->getPSE().getUnionPredicate()); 820 LVer.versionLoop(DefsUsedOutside); 821 LVer.annotateLoopWithNoAlias(); 822 823 // The unversioned loop will not be changed, so we inherit all attributes 824 // from the original loop, but remove the loop distribution metadata to 825 // avoid to distribute it again. 826 MDNode *UnversionedLoopID = 827 makeFollowupLoopID(OrigLoopID, 828 {LLVMLoopDistributeFollowupAll, 829 LLVMLoopDistributeFollowupFallback}, 830 "llvm.loop.distribute.", true) 831 .getValue(); 832 LVer.getNonVersionedLoop()->setLoopID(UnversionedLoopID); 833 } 834 835 // Create identical copies of the original loop for each partition and hook 836 // them up sequentially. 837 Partitions.cloneLoops(); 838 839 // Now, we remove the instruction from each loop that don't belong to that 840 // partition. 841 Partitions.removeUnusedInsts(); 842 LLVM_DEBUG(dbgs() << "\nAfter removing unused Instrs:\n"); 843 LLVM_DEBUG(Partitions.printBlocks()); 844 845 if (LDistVerify) { 846 LI->verify(*DT); 847 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 848 } 849 850 ++NumLoopsDistributed; 851 // Report the success. 852 ORE->emit([&]() { 853 return OptimizationRemark(LDIST_NAME, "Distribute", L->getStartLoc(), 854 L->getHeader()) 855 << "distributed loop"; 856 }); 857 return true; 858 } 859 860 /// Provide diagnostics then \return with false. 861 bool fail(StringRef RemarkName, StringRef Message) { 862 LLVMContext &Ctx = F->getContext(); 863 bool Forced = isForced().getValueOr(false); 864 865 LLVM_DEBUG(dbgs() << "Skipping; " << Message << "\n"); 866 867 // With Rpass-missed report that distribution failed. 868 ORE->emit([&]() { 869 return OptimizationRemarkMissed(LDIST_NAME, "NotDistributed", 870 L->getStartLoc(), L->getHeader()) 871 << "loop not distributed: use -Rpass-analysis=loop-distribute for " 872 "more " 873 "info"; 874 }); 875 876 // With Rpass-analysis report why. This is on by default if distribution 877 // was requested explicitly. 878 ORE->emit(OptimizationRemarkAnalysis( 879 Forced ? OptimizationRemarkAnalysis::AlwaysPrint : LDIST_NAME, 880 RemarkName, L->getStartLoc(), L->getHeader()) 881 << "loop not distributed: " << Message); 882 883 // Also issue a warning if distribution was requested explicitly but it 884 // failed. 885 if (Forced) 886 Ctx.diagnose(DiagnosticInfoOptimizationFailure( 887 *F, L->getStartLoc(), "loop not distributed: failed " 888 "explicitly specified loop distribution")); 889 890 return false; 891 } 892 893 /// Return if distribution forced to be enabled/disabled for the loop. 894 /// 895 /// If the optional has a value, it indicates whether distribution was forced 896 /// to be enabled (true) or disabled (false). If the optional has no value 897 /// distribution was not forced either way. 898 const Optional<bool> &isForced() const { return IsForced; } 899 900 private: 901 /// Filter out checks between pointers from the same partition. 902 /// 903 /// \p PtrToPartition contains the partition number for pointers. Partition 904 /// number -1 means that the pointer is used in multiple partitions. In this 905 /// case we can't safely omit the check. 906 SmallVector<RuntimePointerChecking::PointerCheck, 4> 907 includeOnlyCrossPartitionChecks( 908 const SmallVectorImpl<RuntimePointerChecking::PointerCheck> &AllChecks, 909 const SmallVectorImpl<int> &PtrToPartition, 910 const RuntimePointerChecking *RtPtrChecking) { 911 SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks; 912 913 copy_if(AllChecks, std::back_inserter(Checks), 914 [&](const RuntimePointerChecking::PointerCheck &Check) { 915 for (unsigned PtrIdx1 : Check.first->Members) 916 for (unsigned PtrIdx2 : Check.second->Members) 917 // Only include this check if there is a pair of pointers 918 // that require checking and the pointers fall into 919 // separate partitions. 920 // 921 // (Note that we already know at this point that the two 922 // pointer groups need checking but it doesn't follow 923 // that each pair of pointers within the two groups need 924 // checking as well. 925 // 926 // In other words we don't want to include a check just 927 // because there is a pair of pointers between the two 928 // pointer groups that require checks and a different 929 // pair whose pointers fall into different partitions.) 930 if (RtPtrChecking->needsChecking(PtrIdx1, PtrIdx2) && 931 !RuntimePointerChecking::arePointersInSamePartition( 932 PtrToPartition, PtrIdx1, PtrIdx2)) 933 return true; 934 return false; 935 }); 936 937 return Checks; 938 } 939 940 /// Check whether the loop metadata is forcing distribution to be 941 /// enabled/disabled. 942 void setForced() { 943 Optional<const MDOperand *> Value = 944 findStringMetadataForLoop(L, "llvm.loop.distribute.enable"); 945 if (!Value) 946 return; 947 948 const MDOperand *Op = *Value; 949 assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata"); 950 IsForced = mdconst::extract<ConstantInt>(*Op)->getZExtValue(); 951 } 952 953 Loop *L; 954 Function *F; 955 956 // Analyses used. 957 LoopInfo *LI; 958 const LoopAccessInfo *LAI = nullptr; 959 DominatorTree *DT; 960 ScalarEvolution *SE; 961 OptimizationRemarkEmitter *ORE; 962 963 /// Indicates whether distribution is forced to be enabled/disabled for 964 /// the loop. 965 /// 966 /// If the optional has a value, it indicates whether distribution was forced 967 /// to be enabled (true) or disabled (false). If the optional has no value 968 /// distribution was not forced either way. 969 Optional<bool> IsForced; 970 }; 971 972 } // end anonymous namespace 973 974 /// Shared implementation between new and old PMs. 975 static bool runImpl(Function &F, LoopInfo *LI, DominatorTree *DT, 976 ScalarEvolution *SE, OptimizationRemarkEmitter *ORE, 977 std::function<const LoopAccessInfo &(Loop &)> &GetLAA) { 978 // Build up a worklist of inner-loops to vectorize. This is necessary as the 979 // act of distributing a loop creates new loops and can invalidate iterators 980 // across the loops. 981 SmallVector<Loop *, 8> Worklist; 982 983 for (Loop *TopLevelLoop : *LI) 984 for (Loop *L : depth_first(TopLevelLoop)) 985 // We only handle inner-most loops. 986 if (L->empty()) 987 Worklist.push_back(L); 988 989 // Now walk the identified inner loops. 990 bool Changed = false; 991 for (Loop *L : Worklist) { 992 LoopDistributeForLoop LDL(L, &F, LI, DT, SE, ORE); 993 994 // If distribution was forced for the specific loop to be 995 // enabled/disabled, follow that. Otherwise use the global flag. 996 if (LDL.isForced().getValueOr(EnableLoopDistribute)) 997 Changed |= LDL.processLoop(GetLAA); 998 } 999 1000 // Process each loop nest in the function. 1001 return Changed; 1002 } 1003 1004 namespace { 1005 1006 /// The pass class. 1007 class LoopDistributeLegacy : public FunctionPass { 1008 public: 1009 static char ID; 1010 1011 LoopDistributeLegacy() : FunctionPass(ID) { 1012 // The default is set by the caller. 1013 initializeLoopDistributeLegacyPass(*PassRegistry::getPassRegistry()); 1014 } 1015 1016 bool runOnFunction(Function &F) override { 1017 if (skipFunction(F)) 1018 return false; 1019 1020 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 1021 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 1022 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 1023 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 1024 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 1025 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 1026 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 1027 1028 return runImpl(F, LI, DT, SE, ORE, GetLAA); 1029 } 1030 1031 void getAnalysisUsage(AnalysisUsage &AU) const override { 1032 AU.addRequired<ScalarEvolutionWrapperPass>(); 1033 AU.addRequired<LoopInfoWrapperPass>(); 1034 AU.addPreserved<LoopInfoWrapperPass>(); 1035 AU.addRequired<LoopAccessLegacyAnalysis>(); 1036 AU.addRequired<DominatorTreeWrapperPass>(); 1037 AU.addPreserved<DominatorTreeWrapperPass>(); 1038 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 1039 AU.addPreserved<GlobalsAAWrapperPass>(); 1040 } 1041 }; 1042 1043 } // end anonymous namespace 1044 1045 PreservedAnalyses LoopDistributePass::run(Function &F, 1046 FunctionAnalysisManager &AM) { 1047 auto &LI = AM.getResult<LoopAnalysis>(F); 1048 auto &DT = AM.getResult<DominatorTreeAnalysis>(F); 1049 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); 1050 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); 1051 1052 // We don't directly need these analyses but they're required for loop 1053 // analyses so provide them below. 1054 auto &AA = AM.getResult<AAManager>(F); 1055 auto &AC = AM.getResult<AssumptionAnalysis>(F); 1056 auto &TTI = AM.getResult<TargetIRAnalysis>(F); 1057 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); 1058 1059 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); 1060 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 1061 [&](Loop &L) -> const LoopAccessInfo & { 1062 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, nullptr}; 1063 return LAM.getResult<LoopAccessAnalysis>(L, AR); 1064 }; 1065 1066 bool Changed = runImpl(F, &LI, &DT, &SE, &ORE, GetLAA); 1067 if (!Changed) 1068 return PreservedAnalyses::all(); 1069 PreservedAnalyses PA; 1070 PA.preserve<LoopAnalysis>(); 1071 PA.preserve<DominatorTreeAnalysis>(); 1072 PA.preserve<GlobalsAA>(); 1073 return PA; 1074 } 1075 1076 char LoopDistributeLegacy::ID; 1077 1078 static const char ldist_name[] = "Loop Distribution"; 1079 1080 INITIALIZE_PASS_BEGIN(LoopDistributeLegacy, LDIST_NAME, ldist_name, false, 1081 false) 1082 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 1083 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 1084 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 1085 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 1086 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 1087 INITIALIZE_PASS_END(LoopDistributeLegacy, LDIST_NAME, ldist_name, false, false) 1088 1089 FunctionPass *llvm::createLoopDistributePass() { return new LoopDistributeLegacy(); } 1090