//===- LoopDistribute.cpp - Loop Distribution Pass ------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the Loop Distribution Pass. Its main focus is to
// distribute loops that cannot be vectorized due to dependence cycles. It
// tries to isolate the offending dependences into a new loop allowing
// vectorization of the remaining parts.
//
// For dependence analysis, the pass uses the LoopVectorizer's
// LoopAccessAnalysis. Because this analysis presumes no change in the order of
// memory operations, special care is taken to preserve the lexical order of
// these operations.
//
// Similarly to the Vectorizer, the pass also supports loop versioning to
// run-time disambiguate potentially overlapping arrays.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Scalar/LoopDistribute.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Value.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <cassert>
#include <functional>
#include <list>
#include <tuple>
#include <utility>

using namespace llvm;

#define LDIST_NAME "loop-distribute"
#define DEBUG_TYPE LDIST_NAME

/// @{
/// Metadata attribute names
static const char *const LLVMLoopDistributeFollowupAll =
    "llvm.loop.distribute.followup_all";
static const char *const LLVMLoopDistributeFollowupCoincident =
    "llvm.loop.distribute.followup_coincident";
static const char *const LLVMLoopDistributeFollowupSequential =
    "llvm.loop.distribute.followup_sequential";
static const char *const LLVMLoopDistributeFollowupFallback =
    "llvm.loop.distribute.followup_fallback";
/// @}

static cl::opt<bool>
    LDistVerify("loop-distribute-verify", cl::Hidden,
                cl::desc("Turn on DominatorTree and LoopInfo verification "
                         "after Loop Distribution"),
                cl::init(false));

static cl::opt<bool> DistributeNonIfConvertible(
    "loop-distribute-non-if-convertible", cl::Hidden,
    cl::desc("Whether to distribute into a loop that may not be "
             "if-convertible by the loop vectorizer"),
    cl::init(false));

static cl::opt<unsigned> DistributeSCEVCheckThreshold(
    "loop-distribute-scev-check-threshold", cl::init(8), cl::Hidden,
    cl::desc("The maximum number of SCEV checks allowed for Loop "
             "Distribution"));

static cl::opt<unsigned> PragmaDistributeSCEVCheckThreshold(
    "loop-distribute-scev-check-threshold-with-pragma", cl::init(128),
    cl::Hidden,
    cl::desc(
        "The maximum number of SCEV checks allowed for Loop "
        "Distribution for loop marked with #pragma loop distribute(enable)"));

static cl::opt<bool> EnableLoopDistribute(
    "enable-loop-distribute", cl::Hidden,
    cl::desc("Enable the new, experimental LoopDistribution Pass"),
    cl::init(false));

STATISTIC(NumLoopsDistributed, "Number of loops distributed");

namespace {

/// Maintains the set of instructions of the loop for a partition before
/// cloning. After cloning, it hosts the new loop.
class InstPartition {
  using InstructionSet = SmallPtrSet<Instruction *, 8>;

public:
  InstPartition(Instruction *I, Loop *L, bool DepCycle = false)
      : DepCycle(DepCycle), OrigLoop(L) {
    Set.insert(I);
  }

  /// Returns whether this partition contains a dependence cycle.
  bool hasDepCycle() const { return DepCycle; }

  /// Adds an instruction to this partition.
  void add(Instruction *I) { Set.insert(I); }

  /// Collection accessors.
  InstructionSet::iterator begin() { return Set.begin(); }
  InstructionSet::iterator end() { return Set.end(); }
  InstructionSet::const_iterator begin() const { return Set.begin(); }
  InstructionSet::const_iterator end() const { return Set.end(); }
  bool empty() const { return Set.empty(); }

  /// Moves this partition into \p Other. This partition becomes empty
  /// after this.
  void moveTo(InstPartition &Other) {
    Other.Set.insert(Set.begin(), Set.end());
    Set.clear();
    Other.DepCycle |= DepCycle;
  }

  /// Populates the partition with a transitive closure of all the
  /// instructions that the seeded instructions depend on.
  void populateUsedSet() {
    // FIXME: We currently don't use control-dependence but simply include all
    // blocks (possibly empty at the end) and let simplifycfg mostly clean this
    // up.
    for (auto *B : OrigLoop->getBlocks())
      Set.insert(B->getTerminator());

    // Follow the use-def chains to form a transitive closure of all the
    // instructions that the originally seeded instructions depend on.
    SmallVector<Instruction *, 8> Worklist(Set.begin(), Set.end());
    while (!Worklist.empty()) {
      Instruction *I = Worklist.pop_back_val();
      // Insert instructions from the loop that we depend on.
      for (Value *V : I->operand_values()) {
        auto *I = dyn_cast<Instruction>(V);
        if (I && OrigLoop->contains(I->getParent()) && Set.insert(I).second)
          Worklist.push_back(I);
      }
    }
  }

  /// Clones the original loop.
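  /// The cloned blocks and the value mapping are recorded in ClonedLoopBlocks
  /// and VMap so that remapInstructions() can later rewrite the cloned code.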
  ///
  /// Updates LoopInfo and DominatorTree using the information that block \p
  /// LoopDomBB dominates the loop.
  Loop *cloneLoopWithPreheader(BasicBlock *InsertBefore, BasicBlock *LoopDomBB,
                               unsigned Index, LoopInfo *LI,
                               DominatorTree *DT) {
    ClonedLoop = ::cloneLoopWithPreheader(InsertBefore, LoopDomBB, OrigLoop,
                                          VMap, Twine(".ldist") + Twine(Index),
                                          LI, DT, ClonedLoopBlocks);
    return ClonedLoop;
  }

  /// The cloned loop. If this partition is mapped to the original loop,
  /// this is null.
  const Loop *getClonedLoop() const { return ClonedLoop; }

  /// Returns the loop where this partition ends up after distribution.
  /// If this partition is mapped to the original loop, that is the original
  /// loop itself.
  Loop *getDistributedLoop() const {
    return ClonedLoop ? ClonedLoop : OrigLoop;
  }

  /// The VMap that is populated by cloning and then used in
  /// remapInstructions() to remap the cloned instructions.
  ValueToValueMapTy &getVMap() { return VMap; }

  /// Remaps the cloned instructions using VMap.
  void remapInstructions() {
    remapInstructionsInBlocks(ClonedLoopBlocks, VMap);
  }

  /// Based on the set of instructions selected for this partition,
  /// removes the unnecessary ones.
  void removeUnusedInsts() {
    SmallVector<Instruction *, 8> Unused;

    for (auto *Block : OrigLoop->getBlocks())
      for (auto &Inst : *Block)
        if (!Set.count(&Inst)) {
          Instruction *NewInst = &Inst;
          if (!VMap.empty())
            NewInst = cast<Instruction>(VMap[NewInst]);

          assert(!isa<BranchInst>(NewInst) &&
                 "Branches are marked used early on");
          Unused.push_back(NewInst);
        }

    // Delete the instructions backwards, as that reduces the likelihood of
    // having to update as many def-use and use-def chains.
    for (auto *Inst : reverse(Unused)) {
      if (!Inst->use_empty())
        Inst->replaceAllUsesWith(UndefValue::get(Inst->getType()));
      Inst->eraseFromParent();
    }
  }

  void print() const {
    if (DepCycle)
      dbgs() << " (cycle)\n";
    for (auto *I : Set)
      // Prefix with the block name.
      dbgs() << "  " << I->getParent()->getName() << ":" << *I << "\n";
  }

  void printBlocks() const {
    for (auto *BB : getDistributedLoop()->getBlocks())
      dbgs() << *BB;
  }

private:
  /// Instructions from OrigLoop selected for this partition.
  InstructionSet Set;

  /// Whether this partition contains a dependence cycle.
  bool DepCycle;

  /// The original loop.
  Loop *OrigLoop;

  /// The cloned loop. If this partition is mapped to the original loop,
  /// this is null.
  Loop *ClonedLoop = nullptr;

  /// The blocks of ClonedLoop including the preheader. If this
  /// partition is mapped to the original loop, this is empty.
  SmallVector<BasicBlock *, 8> ClonedLoopBlocks;

  /// This gets populated once the set of instructions has been
  /// finalized. If this partition is mapped to the original loop, this is not
  /// set.
  ValueToValueMapTy VMap;
};

/// Holds the set of Partitions. It populates them, merges them and then
/// clones the loops.
class InstPartitionContainer {
  using InstToPartitionIdT = DenseMap<Instruction *, int>;

public:
  InstPartitionContainer(Loop *L, LoopInfo *LI, DominatorTree *DT)
      : L(L), LI(LI), DT(DT) {}

  /// Returns the number of partitions.
  unsigned getSize() const { return PartitionContainer.size(); }

  /// Adds \p Inst into the current partition if that is marked to
  /// contain cycles. Otherwise start a new partition for it.
  void addToCyclicPartition(Instruction *Inst) {
    // If the current partition is non-cyclic, start a new one.
    if (PartitionContainer.empty() || !PartitionContainer.back().hasDepCycle())
      PartitionContainer.emplace_back(Inst, L, /*DepCycle=*/true);
    else
      PartitionContainer.back().add(Inst);
  }

  /// Adds \p Inst into a partition that is not marked to contain
  /// dependence cycles.
  ///
  // Initially we isolate memory instructions into as many partitions as
  // possible, then later we may merge them back together.
  void addToNewNonCyclicPartition(Instruction *Inst) {
    PartitionContainer.emplace_back(Inst, L);
  }

  /// Merges adjacent non-cyclic partitions.
  ///
  /// The idea is that we currently only want to isolate the non-vectorizable
  /// partition. We could later allow more distribution among these partitions
  /// too.
  void mergeAdjacentNonCyclic() {
    mergeAdjacentPartitionsIf(
        [](const InstPartition *P) { return !P->hasDepCycle(); });
  }

  /// If a partition contains only conditional stores, we won't vectorize
  /// it. Try to merge it with a previous cyclic partition.
  void mergeNonIfConvertible() {
    mergeAdjacentPartitionsIf([&](const InstPartition *Partition) {
      if (Partition->hasDepCycle())
        return true;

      // Now, check if all stores are conditional in this partition.
      bool seenStore = false;

      for (auto *Inst : *Partition)
        if (isa<StoreInst>(Inst)) {
          seenStore = true;
          if (!LoopAccessInfo::blockNeedsPredication(Inst->getParent(), L, DT))
            return false;
        }
      return seenStore;
    });
  }

  /// Merges the partitions according to various heuristics.
  void mergeBeforePopulating() {
    mergeAdjacentNonCyclic();
    if (!DistributeNonIfConvertible)
      mergeNonIfConvertible();
  }

  /// Merges partitions in order to ensure that no loads are duplicated.
  ///
  /// We can't duplicate loads because that could potentially reorder them.
  /// LoopAccessAnalysis provides dependency information with the context that
  /// the order of memory operations is preserved.
  ///
  /// Returns true if any partitions were merged.
  bool mergeToAvoidDuplicatedLoads() {
    using LoadToPartitionT = DenseMap<Instruction *, InstPartition *>;
    using ToBeMergedT = EquivalenceClasses<InstPartition *>;

    LoadToPartitionT LoadToPartition;
    ToBeMergedT ToBeMerged;

    // Step through the partitions and create equivalence between partitions
    // that contain the same load. Also put partitions in between them in the
    // same equivalence class to avoid reordering of memory operations.
    for (PartitionContainerT::iterator I = PartitionContainer.begin(),
                                       E = PartitionContainer.end();
         I != E; ++I) {
      auto *PartI = &*I;

      // If a load occurs in two partitions PartI and PartJ, merge all
      // partitions (PartI, PartJ] into PartI.
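      // For example, if the same load appears in P1 and P3, then P1, P2, and
      // P3 are placed in one equivalence class and merged into a single
      // partition, so the load ends up in exactly one partition and no memory
      // operation is reordered across it.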
      for (Instruction *Inst : *PartI)
        if (isa<LoadInst>(Inst)) {
          bool NewElt;
          LoadToPartitionT::iterator LoadToPart;

          std::tie(LoadToPart, NewElt) =
              LoadToPartition.insert(std::make_pair(Inst, PartI));
          if (!NewElt) {
            LLVM_DEBUG(dbgs()
                       << "Merging partitions due to this load in multiple "
                       << "partitions: " << PartI << ", " << LoadToPart->second
                       << "\n"
                       << *Inst << "\n");

            auto PartJ = I;
            do {
              --PartJ;
              ToBeMerged.unionSets(PartI, &*PartJ);
            } while (&*PartJ != LoadToPart->second);
          }
        }
    }
    if (ToBeMerged.empty())
      return false;

    // Merge the members of an equivalence class into its class leader. This
    // makes the members empty.
    for (ToBeMergedT::iterator I = ToBeMerged.begin(), E = ToBeMerged.end();
         I != E; ++I) {
      if (!I->isLeader())
        continue;

      auto PartI = I->getData();
      for (auto PartJ : make_range(std::next(ToBeMerged.member_begin(I)),
                                   ToBeMerged.member_end())) {
        PartJ->moveTo(*PartI);
      }
    }

    // Remove the empty partitions.
    PartitionContainer.remove_if(
        [](const InstPartition &P) { return P.empty(); });

    return true;
  }

  /// Sets up the mapping from instructions to partitions. If the
  /// instruction is duplicated across multiple partitions, set the entry
  /// to -1.
  void setupPartitionIdOnInstructions() {
    int PartitionID = 0;
    for (const auto &Partition : PartitionContainer) {
      for (Instruction *Inst : Partition) {
        bool NewElt;
        InstToPartitionIdT::iterator Iter;

        std::tie(Iter, NewElt) =
            InstToPartitionId.insert(std::make_pair(Inst, PartitionID));
        if (!NewElt)
          Iter->second = -1;
      }
      ++PartitionID;
    }
  }

  /// Populates the partitions with everything that the seeding
  /// instructions require.
  void populateUsedSet() {
    for (auto &P : PartitionContainer)
      P.populateUsedSet();
  }

  /// This performs the main chunk of the work of cloning the loops for
  /// the partitions.
  void cloneLoops() {
    BasicBlock *OrigPH = L->getLoopPreheader();
    // At this point the predecessor of the preheader is either the memcheck
    // block or the top part of the original preheader.
    BasicBlock *Pred = OrigPH->getSinglePredecessor();
    assert(Pred && "Preheader does not have a single predecessor");
    BasicBlock *ExitBlock = L->getExitBlock();
    assert(ExitBlock && "No single exit block");
    Loop *NewLoop;

    assert(!PartitionContainer.empty() && "at least two partitions expected");
    // We're cloning the preheader along with the loop so we already made sure
    // it was empty.
    assert(&*OrigPH->begin() == OrigPH->getTerminator() &&
           "preheader not empty");

    // Preserve the original loop ID for use after the transformation.
    MDNode *OrigLoopID = L->getLoopID();

    // Create a loop for each partition except the last. Clone the original
    // loop before PH along with adding a preheader for the cloned loop. Then
    // update PH to point to the newly added preheader.
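    //
    // Schematically, for three partitions the blocks end up ordered as
    //
    //   Pred -> PH0 -> Loop0 -> PH1 -> Loop1 -> OrigPH -> OrigLoop -> Exit
    //
    // where Loop0 and Loop1 are the clones (each with its own preheader) and
    // the original loop keeps the last partition.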
    BasicBlock *TopPH = OrigPH;
    unsigned Index = getSize() - 1;
    for (auto I = std::next(PartitionContainer.rbegin()),
              E = PartitionContainer.rend();
         I != E; ++I, --Index, TopPH = NewLoop->getLoopPreheader()) {
      auto *Part = &*I;

      NewLoop = Part->cloneLoopWithPreheader(TopPH, Pred, Index, LI, DT);

      Part->getVMap()[ExitBlock] = TopPH;
      Part->remapInstructions();
      setNewLoopID(OrigLoopID, Part);
    }
    Pred->getTerminator()->replaceUsesOfWith(OrigPH, TopPH);

    // Also set a new loop ID for the last loop.
    setNewLoopID(OrigLoopID, &PartitionContainer.back());

    // Now go in forward order and update the immediate dominator for the
    // preheaders with the exiting block of the previous loop. Dominance
    // within the loop is updated in cloneLoopWithPreheader.
    for (auto Curr = PartitionContainer.cbegin(),
              Next = std::next(PartitionContainer.cbegin()),
              E = PartitionContainer.cend();
         Next != E; ++Curr, ++Next)
      DT->changeImmediateDominator(
          Next->getDistributedLoop()->getLoopPreheader(),
          Curr->getDistributedLoop()->getExitingBlock());
  }

  /// Removes the dead instructions from the cloned loops.
  void removeUnusedInsts() {
    for (auto &Partition : PartitionContainer)
      Partition.removeUnusedInsts();
  }

  /// For each memory pointer, it computes the partitionId the pointer is
  /// used in.
  ///
  /// This returns an array of int where the I-th entry corresponds to the
  /// I-th entry in LAI.getRuntimePointerChecking(). If the pointer is used
  /// in multiple partitions its entry is set to -1.
  SmallVector<int, 8>
  computePartitionSetForPointers(const LoopAccessInfo &LAI) {
    const RuntimePointerChecking *RtPtrCheck = LAI.getRuntimePointerChecking();

    unsigned N = RtPtrCheck->Pointers.size();
    SmallVector<int, 8> PtrToPartitions(N);
    for (unsigned I = 0; I < N; ++I) {
      Value *Ptr = RtPtrCheck->Pointers[I].PointerValue;
      auto Instructions =
          LAI.getInstructionsForAccess(Ptr, RtPtrCheck->Pointers[I].IsWritePtr);

      int &Partition = PtrToPartitions[I];
      // First set it to uninitialized.
      Partition = -2;
      for (Instruction *Inst : Instructions) {
        // Note that this could be -1 if Inst is duplicated across multiple
        // partitions.
        int ThisPartition = this->InstToPartitionId[Inst];
        if (Partition == -2)
          Partition = ThisPartition;
        // -1 means belonging to multiple partitions.
        else if (Partition == -1)
          break;
        else if (Partition != (int)ThisPartition)
          Partition = -1;
      }
      assert(Partition != -2 && "Pointer not belonging to any partition");
    }

    return PtrToPartitions;
  }

  void print(raw_ostream &OS) const {
    unsigned Index = 0;
    for (const auto &P : PartitionContainer) {
      OS << "Partition " << Index++ << " (" << &P << "):\n";
      P.print();
    }
  }

  void dump() const { print(dbgs()); }

#ifndef NDEBUG
  friend raw_ostream &operator<<(raw_ostream &OS,
                                 const InstPartitionContainer &Partitions) {
    Partitions.print(OS);
    return OS;
  }
#endif

  void printBlocks() const {
    unsigned Index = 0;
    for (const auto &P : PartitionContainer) {
      dbgs() << "\nPartition " << Index++ << " (" << &P << "):\n";
      P.printBlocks();
    }
  }

private:
  using PartitionContainerT = std::list<InstPartition>;

  /// List of partitions.
  PartitionContainerT PartitionContainer;

  /// Mapping from Instruction to partition Id.
  /// If the instruction belongs to multiple partitions the entry contains -1.
  InstToPartitionIdT InstToPartitionId;

  Loop *L;
  LoopInfo *LI;
  DominatorTree *DT;

  /// The control structure to merge adjacent partitions if both satisfy
  /// the \p Predicate.
  template <class UnaryPredicate>
  void mergeAdjacentPartitionsIf(UnaryPredicate Predicate) {
    InstPartition *PrevMatch = nullptr;
    for (auto I = PartitionContainer.begin(); I != PartitionContainer.end();) {
      auto DoesMatch = Predicate(&*I);
      if (PrevMatch == nullptr && DoesMatch) {
        PrevMatch = &*I;
        ++I;
      } else if (PrevMatch != nullptr && DoesMatch) {
        I->moveTo(*PrevMatch);
        I = PartitionContainer.erase(I);
      } else {
        PrevMatch = nullptr;
        ++I;
      }
    }
  }

  /// Assign new LoopIDs for the partition's cloned loop.
  void setNewLoopID(MDNode *OrigLoopID, InstPartition *Part) {
    Optional<MDNode *> PartitionID = makeFollowupLoopID(
        OrigLoopID,
        {LLVMLoopDistributeFollowupAll,
         Part->hasDepCycle() ? LLVMLoopDistributeFollowupSequential
                             : LLVMLoopDistributeFollowupCoincident});
    if (PartitionID.hasValue()) {
      Loop *NewLoop = Part->getDistributedLoop();
      NewLoop->setLoopID(PartitionID.getValue());
    }
  }
};

/// For each memory instruction, this class maintains the difference of the
/// number of unsafe dependences that start out from this instruction minus
/// those that end here.
///
/// By traversing the memory instructions in program order and accumulating
/// this number, we know whether any unsafe dependence crosses over a program
/// point.
class MemoryInstructionDependences {
  using Dependence = MemoryDepChecker::Dependence;

public:
  struct Entry {
    Instruction *Inst;
    unsigned NumUnsafeDependencesStartOrEnd = 0;

    Entry(Instruction *Inst) : Inst(Inst) {}
  };

  using AccessesType = SmallVector<Entry, 8>;

  AccessesType::const_iterator begin() const { return Accesses.begin(); }
  AccessesType::const_iterator end() const { return Accesses.end(); }

  MemoryInstructionDependences(
      const SmallVectorImpl<Instruction *> &Instructions,
      const SmallVectorImpl<Dependence> &Dependences) {
    Accesses.append(Instructions.begin(), Instructions.end());

    LLVM_DEBUG(dbgs() << "Backward dependences:\n");
    for (auto &Dep : Dependences)
      if (Dep.isPossiblyBackward()) {
        // Note that the designations source and destination follow the program
        // order, i.e. source is always first. (The direction is given by the
        // DepType.)
        ++Accesses[Dep.Source].NumUnsafeDependencesStartOrEnd;
        --Accesses[Dep.Destination].NumUnsafeDependencesStartOrEnd;

        LLVM_DEBUG(Dep.print(dbgs(), 2, Instructions));
      }
  }

private:
  AccessesType Accesses;
};

/// The actual class performing the per-loop work.
class LoopDistributeForLoop {
public:
  LoopDistributeForLoop(Loop *L, Function *F, LoopInfo *LI, DominatorTree *DT,
                        ScalarEvolution *SE, OptimizationRemarkEmitter *ORE)
      : L(L), F(F), LI(LI), DT(DT), SE(SE), ORE(ORE) {
    setForced();
  }

  /// Try to distribute an inner-most loop.
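  ///
  /// Returns true if the loop was distributed (possibly after being
  /// versioned); on failure, fail() emits the appropriate remarks and this
  /// returns false.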
  bool processLoop(std::function<const LoopAccessInfo &(Loop &)> &GetLAA) {
    assert(L->isInnermost() && "Only process inner loops.");

    LLVM_DEBUG(dbgs() << "\nLDist: In \""
                      << L->getHeader()->getParent()->getName()
                      << "\" checking " << *L << "\n");

    // Having a single exit block implies there's also one exiting block.
    if (!L->getExitBlock())
      return fail("MultipleExitBlocks", "multiple exit blocks");
    if (!L->isLoopSimplifyForm())
      return fail("NotLoopSimplifyForm",
                  "loop is not in loop-simplify form");
    if (!L->isRotatedForm())
      return fail("NotBottomTested", "loop is not bottom tested");

    BasicBlock *PH = L->getLoopPreheader();

    LAI = &GetLAA(*L);

    // Currently, we only distribute to isolate the part of the loop with
    // dependence cycles to enable partial vectorization.
    if (LAI->canVectorizeMemory())
      return fail("MemOpsCanBeVectorized",
                  "memory operations are safe for vectorization");

    auto *Dependences = LAI->getDepChecker().getDependences();
    if (!Dependences || Dependences->empty())
      return fail("NoUnsafeDeps", "no unsafe dependences to isolate");

    InstPartitionContainer Partitions(L, LI, DT);

    // First, go through each memory operation and assign them to consecutive
    // partitions (the order of partitions follows program order). Put those
    // with unsafe dependences into a "cyclic" partition; otherwise put each
    // store in its own "non-cyclic" partition (we'll merge these later).
    //
    // Note that a memory operation (e.g. Load2 below) at a program point that
    // has an unsafe dependence (Store3->Load1) spanning over it must be
    // included in the same cyclic partition as the dependent operations. This
    // is to preserve the original program order after distribution. E.g.:
    //
    //            NumUnsafeDependencesStartOrEnd  NumUnsafeDependencesActive
    //  Load1 -.               1                          0->1
    //  Load2  | /Unsafe/      0                          1
    //  Store3 -'             -1                          1->0
    //  Load4                  0                          0
    //
    // NumUnsafeDependencesActive > 0 indicates this situation and in this case
    // we just keep assigning to the same cyclic partition until
    // NumUnsafeDependencesActive reaches 0.
    const MemoryDepChecker &DepChecker = LAI->getDepChecker();
    MemoryInstructionDependences MID(DepChecker.getMemoryInstructions(),
                                     *Dependences);

    int NumUnsafeDependencesActive = 0;
    for (auto &InstDep : MID) {
      Instruction *I = InstDep.Inst;
      // We update NumUnsafeDependencesActive post-instruction, catch the
      // start of a dependence directly via NumUnsafeDependencesStartOrEnd.
      if (NumUnsafeDependencesActive ||
          InstDep.NumUnsafeDependencesStartOrEnd > 0)
        Partitions.addToCyclicPartition(I);
      else
        Partitions.addToNewNonCyclicPartition(I);
      NumUnsafeDependencesActive += InstDep.NumUnsafeDependencesStartOrEnd;
      assert(NumUnsafeDependencesActive >= 0 &&
             "Negative number of dependences active");
    }

    // Add partitions for values used outside. These partitions can be out of
    // order from the original program order. This is OK because if the
    // partition uses a load we will merge this partition with the original
    // partition of the load that we set up in the previous loop (see
    // mergeToAvoidDuplicatedLoads).
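    //
    // For instance, the final value of a reduction that is read after the
    // loop gets a seed partition of its own here; if that partition ends up
    // using a load already assigned above, mergeToAvoidDuplicatedLoads will
    // fold the two partitions back together.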
    auto DefsUsedOutside = findDefsUsedOutsideOfLoop(L);
    for (auto *Inst : DefsUsedOutside)
      Partitions.addToNewNonCyclicPartition(Inst);

    LLVM_DEBUG(dbgs() << "Seeded partitions:\n" << Partitions);
    if (Partitions.getSize() < 2)
      return fail("CantIsolateUnsafeDeps",
                  "cannot isolate unsafe dependencies");

    // Run the merge heuristics: Merge non-cyclic adjacent partitions since we
    // should be able to vectorize these together.
    Partitions.mergeBeforePopulating();
    LLVM_DEBUG(dbgs() << "\nMerged partitions:\n" << Partitions);
    if (Partitions.getSize() < 2)
      return fail("CantIsolateUnsafeDeps",
                  "cannot isolate unsafe dependencies");

    // Now, populate the partitions with non-memory operations.
    Partitions.populateUsedSet();
    LLVM_DEBUG(dbgs() << "\nPopulated partitions:\n" << Partitions);

    // In order to preserve original lexical order for loads, keep them in the
    // partition that we set up in the MemoryInstructionDependences loop.
    if (Partitions.mergeToAvoidDuplicatedLoads()) {
      LLVM_DEBUG(dbgs() << "\nPartitions merged to ensure unique loads:\n"
                        << Partitions);
      if (Partitions.getSize() < 2)
        return fail("CantIsolateUnsafeDeps",
                    "cannot isolate unsafe dependencies");
    }

    // Don't distribute the loop if we need too many SCEV run-time checks, or
    // if inserting any run-time check at all would be illegal (e.g. in the
    // presence of a convergent operation).
    const SCEVUnionPredicate &Pred = LAI->getPSE().getUnionPredicate();
    if (LAI->hasConvergentOp() && !Pred.isAlwaysTrue()) {
      return fail("RuntimeCheckWithConvergent",
                  "may not insert runtime check with convergent operation");
    }

    if (Pred.getComplexity() > (IsForced.getValueOr(false)
                                    ? PragmaDistributeSCEVCheckThreshold
                                    : DistributeSCEVCheckThreshold))
      return fail("TooManySCEVRuntimeChecks",
                  "too many SCEV run-time checks needed.\n");

    if (!IsForced.getValueOr(false) && hasDisableAllTransformsHint(L))
      return fail("HeuristicDisabled", "distribution heuristic disabled");

    LLVM_DEBUG(dbgs() << "\nDistributing loop: " << *L << "\n");
    // We're done forming the partitions; set up the reverse mapping from
    // instructions to partitions.
    Partitions.setupPartitionIdOnInstructions();

    // If we need run-time checks, version the loop now.
    auto PtrToPartition = Partitions.computePartitionSetForPointers(*LAI);
    const auto *RtPtrChecking = LAI->getRuntimePointerChecking();
    const auto &AllChecks = RtPtrChecking->getChecks();
    auto Checks = includeOnlyCrossPartitionChecks(AllChecks, PtrToPartition,
                                                  RtPtrChecking);

    if (LAI->hasConvergentOp() && !Checks.empty()) {
      return fail("RuntimeCheckWithConvergent",
                  "may not insert runtime check with convergent operation");
    }

    // To keep things simple have an empty preheader before we version or clone
    // the loop. (Also split if this has no predecessor, i.e. entry, because we
    // rely on PH having a predecessor.)
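    // (For example, if PH is the function entry or still holds loop-invariant
    // computations, the split leaves those in place and creates a fresh
    // preheader that consists of a single branch.)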
    if (!PH->getSinglePredecessor() || &*PH->begin() != PH->getTerminator())
      SplitBlock(PH, PH->getTerminator(), DT, LI);

    if (!Pred.isAlwaysTrue() || !Checks.empty()) {
      assert(!LAI->hasConvergentOp() && "inserting illegal loop versioning");

      MDNode *OrigLoopID = L->getLoopID();

      LLVM_DEBUG(dbgs() << "\nPointers:\n");
      LLVM_DEBUG(LAI->getRuntimePointerChecking()->printChecks(dbgs(), Checks));
      LoopVersioning LVer(*LAI, Checks, L, LI, DT, SE);
      LVer.versionLoop(DefsUsedOutside);
      LVer.annotateLoopWithNoAlias();

      // The unversioned loop will not be changed, so we inherit all attributes
      // from the original loop, but remove the loop distribution metadata to
      // avoid distributing it again.
      MDNode *UnversionedLoopID =
          makeFollowupLoopID(OrigLoopID,
                             {LLVMLoopDistributeFollowupAll,
                              LLVMLoopDistributeFollowupFallback},
                             "llvm.loop.distribute.", true)
              .getValue();
      LVer.getNonVersionedLoop()->setLoopID(UnversionedLoopID);
    }

    // Create identical copies of the original loop for each partition and hook
    // them up sequentially.
    Partitions.cloneLoops();

    // Now, remove the instructions from each loop that don't belong to that
    // loop's partition.
    Partitions.removeUnusedInsts();
    LLVM_DEBUG(dbgs() << "\nAfter removing unused Instrs:\n");
    LLVM_DEBUG(Partitions.printBlocks());

    if (LDistVerify) {
      LI->verify(*DT);
      assert(DT->verify(DominatorTree::VerificationLevel::Fast));
    }

    ++NumLoopsDistributed;
    // Report the success.
    ORE->emit([&]() {
      return OptimizationRemark(LDIST_NAME, "Distribute", L->getStartLoc(),
                                L->getHeader())
             << "distributed loop";
    });
    return true;
  }

  /// Provide diagnostics, then \return false.
  bool fail(StringRef RemarkName, StringRef Message) {
    LLVMContext &Ctx = F->getContext();
    bool Forced = isForced().getValueOr(false);

    LLVM_DEBUG(dbgs() << "Skipping; " << Message << "\n");

    // With Rpass-missed report that distribution failed.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LDIST_NAME, "NotDistributed",
                                      L->getStartLoc(), L->getHeader())
             << "loop not distributed: use -Rpass-analysis=loop-distribute "
                "for more info";
    });

    // With Rpass-analysis report why. This is on by default if distribution
    // was requested explicitly.
    ORE->emit(OptimizationRemarkAnalysis(
                  Forced ? OptimizationRemarkAnalysis::AlwaysPrint : LDIST_NAME,
                  RemarkName, L->getStartLoc(), L->getHeader())
              << "loop not distributed: " << Message);

    // Also issue a warning if distribution was requested explicitly but it
    // failed.
    if (Forced)
      Ctx.diagnose(DiagnosticInfoOptimizationFailure(
          *F, L->getStartLoc(), "loop not distributed: failed "
                                "explicitly specified loop distribution"));

    return false;
  }

  /// Return whether distribution was forced to be enabled/disabled for the
  /// loop.
  ///
  /// If the optional has a value, it indicates whether distribution was forced
  /// to be enabled (true) or disabled (false). If the optional has no value
  /// distribution was not forced either way.
  const Optional<bool> &isForced() const { return IsForced; }

private:
  /// Filter out checks between pointers from the same partition.
  ///
  /// \p PtrToPartition contains the partition number for pointers. Partition
  /// number -1 means that the pointer is used in multiple partitions.
  /// In this case we can't safely omit the check.
  SmallVector<RuntimePointerCheck, 4> includeOnlyCrossPartitionChecks(
      const SmallVectorImpl<RuntimePointerCheck> &AllChecks,
      const SmallVectorImpl<int> &PtrToPartition,
      const RuntimePointerChecking *RtPtrChecking) {
    SmallVector<RuntimePointerCheck, 4> Checks;

    copy_if(AllChecks, std::back_inserter(Checks),
            [&](const RuntimePointerCheck &Check) {
              for (unsigned PtrIdx1 : Check.first->Members)
                for (unsigned PtrIdx2 : Check.second->Members)
                  // Only include this check if there is a pair of pointers
                  // that require checking and the pointers fall into
                  // separate partitions.
                  //
                  // (Note that we already know at this point that the two
                  // pointer groups need checking but it doesn't follow
                  // that each pair of pointers within the two groups needs
                  // checking as well.
                  //
                  // In other words we don't want to include a check just
                  // because there is a pair of pointers between the two
                  // pointer groups that require checks and a different
                  // pair whose pointers fall into different partitions.)
                  if (RtPtrChecking->needsChecking(PtrIdx1, PtrIdx2) &&
                      !RuntimePointerChecking::arePointersInSamePartition(
                          PtrToPartition, PtrIdx1, PtrIdx2))
                    return true;
              return false;
            });

    return Checks;
  }

  /// Check whether the loop metadata is forcing distribution to be
  /// enabled/disabled.
  void setForced() {
    Optional<const MDOperand *> Value =
        findStringMetadataForLoop(L, "llvm.loop.distribute.enable");
    if (!Value)
      return;

    const MDOperand *Op = *Value;
    assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata");
    IsForced = mdconst::extract<ConstantInt>(*Op)->getZExtValue();
  }

  Loop *L;
  Function *F;

  // Analyses used.
  LoopInfo *LI;
  const LoopAccessInfo *LAI = nullptr;
  DominatorTree *DT;
  ScalarEvolution *SE;
  OptimizationRemarkEmitter *ORE;

  /// Indicates whether distribution is forced to be enabled/disabled for
  /// the loop.
  ///
  /// If the optional has a value, it indicates whether distribution was forced
  /// to be enabled (true) or disabled (false). If the optional has no value
  /// distribution was not forced either way.
  Optional<bool> IsForced;
};

} // end anonymous namespace

/// Shared implementation between new and old PMs.
static bool runImpl(Function &F, LoopInfo *LI, DominatorTree *DT,
                    ScalarEvolution *SE, OptimizationRemarkEmitter *ORE,
                    std::function<const LoopAccessInfo &(Loop &)> &GetLAA) {
  // Build up a worklist of inner-most loops to distribute. This is necessary
  // as the act of distributing a loop creates new loops and can invalidate
  // iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *TopLevelLoop : *LI)
    for (Loop *L : depth_first(TopLevelLoop))
      // We only handle inner-most loops.
      if (L->isInnermost())
        Worklist.push_back(L);

  // Now walk the identified inner loops.
  bool Changed = false;
  for (Loop *L : Worklist) {
    LoopDistributeForLoop LDL(L, &F, LI, DT, SE, ORE);

    // If distribution was forced for the specific loop to be
    // enabled/disabled, follow that. Otherwise use the global flag.
    if (LDL.isForced().getValueOr(EnableLoopDistribute))
      Changed |= LDL.processLoop(GetLAA);
  }

  return Changed;
}

namespace {

/// The pass class.
class LoopDistributeLegacy : public FunctionPass {
public:
  static char ID;

  LoopDistributeLegacy() : FunctionPass(ID) {
    // The default is set by the caller.
    initializeLoopDistributeLegacyPass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override {
    if (skipFunction(F))
      return false;

    auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
    auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
    std::function<const LoopAccessInfo &(Loop &)> GetLAA =
        [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };

    return runImpl(F, LI, DT, SE, ORE, GetLAA);
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();
    AU.addPreserved<LoopInfoWrapperPass>();
    AU.addRequired<LoopAccessLegacyAnalysis>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
    AU.addPreserved<GlobalsAAWrapperPass>();
  }
};

} // end anonymous namespace

PreservedAnalyses LoopDistributePass::run(Function &F,
                                          FunctionAnalysisManager &AM) {
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);

  // We don't directly need these analyses but they're required for loop
  // analyses so provide them below.
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,
                                      SE,  TLI, TTI,     nullptr,
                                      nullptr};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };

  bool Changed = runImpl(F, &LI, &DT, &SE, &ORE, GetLAA);
  if (!Changed)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;
  PA.preserve<LoopAnalysis>();
  PA.preserve<DominatorTreeAnalysis>();
  PA.preserve<GlobalsAA>();
  return PA;
}

char LoopDistributeLegacy::ID;

static const char ldist_name[] = "Loop Distribution";

INITIALIZE_PASS_BEGIN(LoopDistributeLegacy, LDIST_NAME, ldist_name, false,
                      false)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_END(LoopDistributeLegacy, LDIST_NAME, ldist_name, false, false)

FunctionPass *llvm::createLoopDistributePass() {
  return new LoopDistributeLegacy();
}