//===- LoopDistribute.cpp - Loop Distribution Pass ------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file implements the Loop Distribution Pass. Its main focus is to // distribute loops that cannot be vectorized due to dependence cycles. It // tries to isolate the offending dependences into a new loop allowing // vectorization of the remaining parts. // // For dependence analysis, the pass uses the LoopVectorizer's // LoopAccessAnalysis. Because this analysis presumes no change in the order of // memory operations, special care is taken to preserve the lexical order of // these operations. // // Similarly to the Vectorizer, the pass also supports loop versioning to // run-time disambiguate potentially overlapping arrays. // //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/LoopDistribute.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/EquivalenceClasses.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/Value.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/LoopVersioning.h" #include "llvm/Transforms/Utils/ValueMapper.h" #include #include #include #include #include using namespace llvm; #define LDIST_NAME "loop-distribute" #define DEBUG_TYPE LDIST_NAME /// @{ /// Metadata attribute names static const char *const LLVMLoopDistributeFollowupAll = "llvm.loop.distribute.followup_all"; static const char *const LLVMLoopDistributeFollowupCoincident = "llvm.loop.distribute.followup_coincident"; static const char *const LLVMLoopDistributeFollowupSequential = "llvm.loop.distribute.followup_sequential"; static const char *const LLVMLoopDistributeFollowupFallback = "llvm.loop.distribute.followup_fallback"; /// @} static cl::opt LDistVerify("loop-distribute-verify", cl::Hidden, cl::desc("Turn on DominatorTree and LoopInfo verification " "after Loop Distribution"), cl::init(false)); static cl::opt DistributeNonIfConvertible( "loop-distribute-non-if-convertible", cl::Hidden, cl::desc("Whether to distribute into a loop that may not be " "if-convertible by the loop vectorizer"), cl::init(false)); static cl::opt DistributeSCEVCheckThreshold( "loop-distribute-scev-check-threshold", cl::init(8), cl::Hidden, cl::desc("The maximum number of SCEV checks allowed for Loop " "Distribution")); static cl::opt PragmaDistributeSCEVCheckThreshold( "loop-distribute-scev-check-threshold-with-pragma", cl::init(128), cl::Hidden, cl::desc( "The maximum number of SCEV checks allowed for Loop " "Distribution for loop marked with #pragma loop distribute(enable)")); static cl::opt EnableLoopDistribute( "enable-loop-distribute", cl::Hidden, cl::desc("Enable the new, experimental LoopDistribution Pass"), cl::init(false)); STATISTIC(NumLoopsDistributed, "Number of loops distributed"); namespace { /// Maintains the set of instructions of the loop for a partition before /// cloning. After cloning, it hosts the new loop. class InstPartition { using InstructionSet = SmallPtrSet; public: InstPartition(Instruction *I, Loop *L, bool DepCycle = false) : DepCycle(DepCycle), OrigLoop(L) { Set.insert(I); } /// Returns whether this partition contains a dependence cycle. bool hasDepCycle() const { return DepCycle; } /// Adds an instruction to this partition. void add(Instruction *I) { Set.insert(I); } /// Collection accessors. InstructionSet::iterator begin() { return Set.begin(); } InstructionSet::iterator end() { return Set.end(); } InstructionSet::const_iterator begin() const { return Set.begin(); } InstructionSet::const_iterator end() const { return Set.end(); } bool empty() const { return Set.empty(); } /// Moves this partition into \p Other. This partition becomes empty /// after this. void moveTo(InstPartition &Other) { Other.Set.insert(Set.begin(), Set.end()); Set.clear(); Other.DepCycle |= DepCycle; } /// Populates the partition with a transitive closure of all the /// instructions that the seeded instructions dependent on. void populateUsedSet() { // FIXME: We currently don't use control-dependence but simply include all // blocks (possibly empty at the end) and let simplifycfg mostly clean this // up. for (auto *B : OrigLoop->getBlocks()) Set.insert(B->getTerminator()); // Follow the use-def chains to form a transitive closure of all the // instructions that the originally seeded instructions depend on. SmallVector Worklist(Set.begin(), Set.end()); while (!Worklist.empty()) { Instruction *I = Worklist.pop_back_val(); // Insert instructions from the loop that we depend on. for (Value *V : I->operand_values()) { auto *I = dyn_cast(V); if (I && OrigLoop->contains(I->getParent()) && Set.insert(I).second) Worklist.push_back(I); } } } /// Clones the original loop. /// /// Updates LoopInfo and DominatorTree using the information that block \p /// LoopDomBB dominates the loop. Loop *cloneLoopWithPreheader(BasicBlock *InsertBefore, BasicBlock *LoopDomBB, unsigned Index, LoopInfo *LI, DominatorTree *DT) { ClonedLoop = ::cloneLoopWithPreheader(InsertBefore, LoopDomBB, OrigLoop, VMap, Twine(".ldist") + Twine(Index), LI, DT, ClonedLoopBlocks); return ClonedLoop; } /// The cloned loop. If this partition is mapped to the original loop, /// this is null. const Loop *getClonedLoop() const { return ClonedLoop; } /// Returns the loop where this partition ends up after distribution. /// If this partition is mapped to the original loop then use the block from /// the loop. Loop *getDistributedLoop() const { return ClonedLoop ? ClonedLoop : OrigLoop; } /// The VMap that is populated by cloning and then used in /// remapinstruction to remap the cloned instructions. ValueToValueMapTy &getVMap() { return VMap; } /// Remaps the cloned instructions using VMap. void remapInstructions() { remapInstructionsInBlocks(ClonedLoopBlocks, VMap); } /// Based on the set of instructions selected for this partition, /// removes the unnecessary ones. void removeUnusedInsts() { SmallVector Unused; for (auto *Block : OrigLoop->getBlocks()) for (auto &Inst : *Block) if (!Set.count(&Inst)) { Instruction *NewInst = &Inst; if (!VMap.empty()) NewInst = cast(VMap[NewInst]); assert(!isa(NewInst) && "Branches are marked used early on"); Unused.push_back(NewInst); } // Delete the instructions backwards, as it has a reduced likelihood of // having to update as many def-use and use-def chains. for (auto *Inst : reverse(Unused)) { if (!Inst->use_empty()) Inst->replaceAllUsesWith(PoisonValue::get(Inst->getType())); Inst->eraseFromParent(); } } void print() const { if (DepCycle) dbgs() << " (cycle)\n"; for (auto *I : Set) // Prefix with the block name. dbgs() << " " << I->getParent()->getName() << ":" << *I << "\n"; } void printBlocks() const { for (auto *BB : getDistributedLoop()->getBlocks()) dbgs() << *BB; } private: /// Instructions from OrigLoop selected for this partition. InstructionSet Set; /// Whether this partition contains a dependence cycle. bool DepCycle; /// The original loop. Loop *OrigLoop; /// The cloned loop. If this partition is mapped to the original loop, /// this is null. Loop *ClonedLoop = nullptr; /// The blocks of ClonedLoop including the preheader. If this /// partition is mapped to the original loop, this is empty. SmallVector ClonedLoopBlocks; /// These gets populated once the set of instructions have been /// finalized. If this partition is mapped to the original loop, these are not /// set. ValueToValueMapTy VMap; }; /// Holds the set of Partitions. It populates them, merges them and then /// clones the loops. class InstPartitionContainer { using InstToPartitionIdT = DenseMap; public: InstPartitionContainer(Loop *L, LoopInfo *LI, DominatorTree *DT) : L(L), LI(LI), DT(DT) {} /// Returns the number of partitions. unsigned getSize() const { return PartitionContainer.size(); } /// Adds \p Inst into the current partition if that is marked to /// contain cycles. Otherwise start a new partition for it. void addToCyclicPartition(Instruction *Inst) { // If the current partition is non-cyclic. Start a new one. if (PartitionContainer.empty() || !PartitionContainer.back().hasDepCycle()) PartitionContainer.emplace_back(Inst, L, /*DepCycle=*/true); else PartitionContainer.back().add(Inst); } /// Adds \p Inst into a partition that is not marked to contain /// dependence cycles. /// // Initially we isolate memory instructions into as many partitions as // possible, then later we may merge them back together. void addToNewNonCyclicPartition(Instruction *Inst) { PartitionContainer.emplace_back(Inst, L); } /// Merges adjacent non-cyclic partitions. /// /// The idea is that we currently only want to isolate the non-vectorizable /// partition. We could later allow more distribution among these partition /// too. void mergeAdjacentNonCyclic() { mergeAdjacentPartitionsIf( [](const InstPartition *P) { return !P->hasDepCycle(); }); } /// If a partition contains only conditional stores, we won't vectorize /// it. Try to merge it with a previous cyclic partition. void mergeNonIfConvertible() { mergeAdjacentPartitionsIf([&](const InstPartition *Partition) { if (Partition->hasDepCycle()) return true; // Now, check if all stores are conditional in this partition. bool seenStore = false; for (auto *Inst : *Partition) if (isa(Inst)) { seenStore = true; if (!LoopAccessInfo::blockNeedsPredication(Inst->getParent(), L, DT)) return false; } return seenStore; }); } /// Merges the partitions according to various heuristics. void mergeBeforePopulating() { mergeAdjacentNonCyclic(); if (!DistributeNonIfConvertible) mergeNonIfConvertible(); } /// Merges partitions in order to ensure that no loads are duplicated. /// /// We can't duplicate loads because that could potentially reorder them. /// LoopAccessAnalysis provides dependency information with the context that /// the order of memory operation is preserved. /// /// Return if any partitions were merged. bool mergeToAvoidDuplicatedLoads() { using LoadToPartitionT = DenseMap; using ToBeMergedT = EquivalenceClasses; LoadToPartitionT LoadToPartition; ToBeMergedT ToBeMerged; // Step through the partitions and create equivalence between partitions // that contain the same load. Also put partitions in between them in the // same equivalence class to avoid reordering of memory operations. for (PartitionContainerT::iterator I = PartitionContainer.begin(), E = PartitionContainer.end(); I != E; ++I) { auto *PartI = &*I; // If a load occurs in two partitions PartI and PartJ, merge all // partitions (PartI, PartJ] into PartI. for (Instruction *Inst : *PartI) if (isa(Inst)) { bool NewElt; LoadToPartitionT::iterator LoadToPart; std::tie(LoadToPart, NewElt) = LoadToPartition.insert(std::make_pair(Inst, PartI)); if (!NewElt) { LLVM_DEBUG(dbgs() << "Merging partitions due to this load in multiple " << "partitions: " << PartI << ", " << LoadToPart->second << "\n" << *Inst << "\n"); auto PartJ = I; do { --PartJ; ToBeMerged.unionSets(PartI, &*PartJ); } while (&*PartJ != LoadToPart->second); } } } if (ToBeMerged.empty()) return false; // Merge the member of an equivalence class into its class leader. This // makes the members empty. for (ToBeMergedT::iterator I = ToBeMerged.begin(), E = ToBeMerged.end(); I != E; ++I) { if (!I->isLeader()) continue; auto PartI = I->getData(); for (auto PartJ : make_range(std::next(ToBeMerged.member_begin(I)), ToBeMerged.member_end())) { PartJ->moveTo(*PartI); } } // Remove the empty partitions. PartitionContainer.remove_if( [](const InstPartition &P) { return P.empty(); }); return true; } /// Sets up the mapping between instructions to partitions. If the /// instruction is duplicated across multiple partitions, set the entry to -1. void setupPartitionIdOnInstructions() { int PartitionID = 0; for (const auto &Partition : PartitionContainer) { for (Instruction *Inst : Partition) { bool NewElt; InstToPartitionIdT::iterator Iter; std::tie(Iter, NewElt) = InstToPartitionId.insert(std::make_pair(Inst, PartitionID)); if (!NewElt) Iter->second = -1; } ++PartitionID; } } /// Populates the partition with everything that the seeding /// instructions require. void populateUsedSet() { for (auto &P : PartitionContainer) P.populateUsedSet(); } /// This performs the main chunk of the work of cloning the loops for /// the partitions. void cloneLoops() { BasicBlock *OrigPH = L->getLoopPreheader(); // At this point the predecessor of the preheader is either the memcheck // block or the top part of the original preheader. BasicBlock *Pred = OrigPH->getSinglePredecessor(); assert(Pred && "Preheader does not have a single predecessor"); BasicBlock *ExitBlock = L->getExitBlock(); assert(ExitBlock && "No single exit block"); Loop *NewLoop; assert(!PartitionContainer.empty() && "at least two partitions expected"); // We're cloning the preheader along with the loop so we already made sure // it was empty. assert(&*OrigPH->begin() == OrigPH->getTerminator() && "preheader not empty"); // Preserve the original loop ID for use after the transformation. MDNode *OrigLoopID = L->getLoopID(); // Create a loop for each partition except the last. Clone the original // loop before PH along with adding a preheader for the cloned loop. Then // update PH to point to the newly added preheader. BasicBlock *TopPH = OrigPH; unsigned Index = getSize() - 1; for (auto I = std::next(PartitionContainer.rbegin()), E = PartitionContainer.rend(); I != E; ++I, --Index, TopPH = NewLoop->getLoopPreheader()) { auto *Part = &*I; NewLoop = Part->cloneLoopWithPreheader(TopPH, Pred, Index, LI, DT); Part->getVMap()[ExitBlock] = TopPH; Part->remapInstructions(); setNewLoopID(OrigLoopID, Part); } Pred->getTerminator()->replaceUsesOfWith(OrigPH, TopPH); // Also set a new loop ID for the last loop. setNewLoopID(OrigLoopID, &PartitionContainer.back()); // Now go in forward order and update the immediate dominator for the // preheaders with the exiting block of the previous loop. Dominance // within the loop is updated in cloneLoopWithPreheader. for (auto Curr = PartitionContainer.cbegin(), Next = std::next(PartitionContainer.cbegin()), E = PartitionContainer.cend(); Next != E; ++Curr, ++Next) DT->changeImmediateDominator( Next->getDistributedLoop()->getLoopPreheader(), Curr->getDistributedLoop()->getExitingBlock()); } /// Removes the dead instructions from the cloned loops. void removeUnusedInsts() { for (auto &Partition : PartitionContainer) Partition.removeUnusedInsts(); } /// For each memory pointer, it computes the partitionId the pointer is /// used in. /// /// This returns an array of int where the I-th entry corresponds to I-th /// entry in LAI.getRuntimePointerCheck(). If the pointer is used in multiple /// partitions its entry is set to -1. SmallVector computePartitionSetForPointers(const LoopAccessInfo &LAI) { const RuntimePointerChecking *RtPtrCheck = LAI.getRuntimePointerChecking(); unsigned N = RtPtrCheck->Pointers.size(); SmallVector PtrToPartitions(N); for (unsigned I = 0; I < N; ++I) { Value *Ptr = RtPtrCheck->Pointers[I].PointerValue; auto Instructions = LAI.getInstructionsForAccess(Ptr, RtPtrCheck->Pointers[I].IsWritePtr); int &Partition = PtrToPartitions[I]; // First set it to uninitialized. Partition = -2; for (Instruction *Inst : Instructions) { // Note that this could be -1 if Inst is duplicated across multiple // partitions. int ThisPartition = this->InstToPartitionId[Inst]; if (Partition == -2) Partition = ThisPartition; // -1 means belonging to multiple partitions. else if (Partition == -1) break; else if (Partition != (int)ThisPartition) Partition = -1; } assert(Partition != -2 && "Pointer not belonging to any partition"); } return PtrToPartitions; } void print(raw_ostream &OS) const { unsigned Index = 0; for (const auto &P : PartitionContainer) { OS << "Partition " << Index++ << " (" << &P << "):\n"; P.print(); } } void dump() const { print(dbgs()); } #ifndef NDEBUG friend raw_ostream &operator<<(raw_ostream &OS, const InstPartitionContainer &Partitions) { Partitions.print(OS); return OS; } #endif void printBlocks() const { unsigned Index = 0; for (const auto &P : PartitionContainer) { dbgs() << "\nPartition " << Index++ << " (" << &P << "):\n"; P.printBlocks(); } } private: using PartitionContainerT = std::list; /// List of partitions. PartitionContainerT PartitionContainer; /// Mapping from Instruction to partition Id. If the instruction /// belongs to multiple partitions the entry contains -1. InstToPartitionIdT InstToPartitionId; Loop *L; LoopInfo *LI; DominatorTree *DT; /// The control structure to merge adjacent partitions if both satisfy /// the \p Predicate. template void mergeAdjacentPartitionsIf(UnaryPredicate Predicate) { InstPartition *PrevMatch = nullptr; for (auto I = PartitionContainer.begin(); I != PartitionContainer.end();) { auto DoesMatch = Predicate(&*I); if (PrevMatch == nullptr && DoesMatch) { PrevMatch = &*I; ++I; } else if (PrevMatch != nullptr && DoesMatch) { I->moveTo(*PrevMatch); I = PartitionContainer.erase(I); } else { PrevMatch = nullptr; ++I; } } } /// Assign new LoopIDs for the partition's cloned loop. void setNewLoopID(MDNode *OrigLoopID, InstPartition *Part) { Optional PartitionID = makeFollowupLoopID( OrigLoopID, {LLVMLoopDistributeFollowupAll, Part->hasDepCycle() ? LLVMLoopDistributeFollowupSequential : LLVMLoopDistributeFollowupCoincident}); if (PartitionID) { Loop *NewLoop = Part->getDistributedLoop(); NewLoop->setLoopID(PartitionID.value()); } } }; /// For each memory instruction, this class maintains difference of the /// number of unsafe dependences that start out from this instruction minus /// those that end here. /// /// By traversing the memory instructions in program order and accumulating this /// number, we know whether any unsafe dependence crosses over a program point. class MemoryInstructionDependences { using Dependence = MemoryDepChecker::Dependence; public: struct Entry { Instruction *Inst; unsigned NumUnsafeDependencesStartOrEnd = 0; Entry(Instruction *Inst) : Inst(Inst) {} }; using AccessesType = SmallVector; AccessesType::const_iterator begin() const { return Accesses.begin(); } AccessesType::const_iterator end() const { return Accesses.end(); } MemoryInstructionDependences( const SmallVectorImpl &Instructions, const SmallVectorImpl &Dependences) { Accesses.append(Instructions.begin(), Instructions.end()); LLVM_DEBUG(dbgs() << "Backward dependences:\n"); for (auto &Dep : Dependences) if (Dep.isPossiblyBackward()) { // Note that the designations source and destination follow the program // order, i.e. source is always first. (The direction is given by the // DepType.) ++Accesses[Dep.Source].NumUnsafeDependencesStartOrEnd; --Accesses[Dep.Destination].NumUnsafeDependencesStartOrEnd; LLVM_DEBUG(Dep.print(dbgs(), 2, Instructions)); } } private: AccessesType Accesses; }; /// The actual class performing the per-loop work. class LoopDistributeForLoop { public: LoopDistributeForLoop(Loop *L, Function *F, LoopInfo *LI, DominatorTree *DT, ScalarEvolution *SE, OptimizationRemarkEmitter *ORE) : L(L), F(F), LI(LI), DT(DT), SE(SE), ORE(ORE) { setForced(); } /// Try to distribute an inner-most loop. bool processLoop(std::function &GetLAA) { assert(L->isInnermost() && "Only process inner loops."); LLVM_DEBUG(dbgs() << "\nLDist: In \"" << L->getHeader()->getParent()->getName() << "\" checking " << *L << "\n"); // Having a single exit block implies there's also one exiting block. if (!L->getExitBlock()) return fail("MultipleExitBlocks", "multiple exit blocks"); if (!L->isLoopSimplifyForm()) return fail("NotLoopSimplifyForm", "loop is not in loop-simplify form"); if (!L->isRotatedForm()) return fail("NotBottomTested", "loop is not bottom tested"); BasicBlock *PH = L->getLoopPreheader(); LAI = &GetLAA(*L); // Currently, we only distribute to isolate the part of the loop with // dependence cycles to enable partial vectorization. if (LAI->canVectorizeMemory()) return fail("MemOpsCanBeVectorized", "memory operations are safe for vectorization"); auto *Dependences = LAI->getDepChecker().getDependences(); if (!Dependences || Dependences->empty()) return fail("NoUnsafeDeps", "no unsafe dependences to isolate"); InstPartitionContainer Partitions(L, LI, DT); // First, go through each memory operation and assign them to consecutive // partitions (the order of partitions follows program order). Put those // with unsafe dependences into "cyclic" partition otherwise put each store // in its own "non-cyclic" partition (we'll merge these later). // // Note that a memory operation (e.g. Load2 below) at a program point that // has an unsafe dependence (Store3->Load1) spanning over it must be // included in the same cyclic partition as the dependent operations. This // is to preserve the original program order after distribution. E.g.: // // NumUnsafeDependencesStartOrEnd NumUnsafeDependencesActive // Load1 -. 1 0->1 // Load2 | /Unsafe/ 0 1 // Store3 -' -1 1->0 // Load4 0 0 // // NumUnsafeDependencesActive > 0 indicates this situation and in this case // we just keep assigning to the same cyclic partition until // NumUnsafeDependencesActive reaches 0. const MemoryDepChecker &DepChecker = LAI->getDepChecker(); MemoryInstructionDependences MID(DepChecker.getMemoryInstructions(), *Dependences); int NumUnsafeDependencesActive = 0; for (auto &InstDep : MID) { Instruction *I = InstDep.Inst; // We update NumUnsafeDependencesActive post-instruction, catch the // start of a dependence directly via NumUnsafeDependencesStartOrEnd. if (NumUnsafeDependencesActive || InstDep.NumUnsafeDependencesStartOrEnd > 0) Partitions.addToCyclicPartition(I); else Partitions.addToNewNonCyclicPartition(I); NumUnsafeDependencesActive += InstDep.NumUnsafeDependencesStartOrEnd; assert(NumUnsafeDependencesActive >= 0 && "Negative number of dependences active"); } // Add partitions for values used outside. These partitions can be out of // order from the original program order. This is OK because if the // partition uses a load we will merge this partition with the original // partition of the load that we set up in the previous loop (see // mergeToAvoidDuplicatedLoads). auto DefsUsedOutside = findDefsUsedOutsideOfLoop(L); for (auto *Inst : DefsUsedOutside) Partitions.addToNewNonCyclicPartition(Inst); LLVM_DEBUG(dbgs() << "Seeded partitions:\n" << Partitions); if (Partitions.getSize() < 2) return fail("CantIsolateUnsafeDeps", "cannot isolate unsafe dependencies"); // Run the merge heuristics: Merge non-cyclic adjacent partitions since we // should be able to vectorize these together. Partitions.mergeBeforePopulating(); LLVM_DEBUG(dbgs() << "\nMerged partitions:\n" << Partitions); if (Partitions.getSize() < 2) return fail("CantIsolateUnsafeDeps", "cannot isolate unsafe dependencies"); // Now, populate the partitions with non-memory operations. Partitions.populateUsedSet(); LLVM_DEBUG(dbgs() << "\nPopulated partitions:\n" << Partitions); // In order to preserve original lexical order for loads, keep them in the // partition that we set up in the MemoryInstructionDependences loop. if (Partitions.mergeToAvoidDuplicatedLoads()) { LLVM_DEBUG(dbgs() << "\nPartitions merged to ensure unique loads:\n" << Partitions); if (Partitions.getSize() < 2) return fail("CantIsolateUnsafeDeps", "cannot isolate unsafe dependencies"); } // Don't distribute the loop if we need too many SCEV run-time checks, or // any if it's illegal. const SCEVPredicate &Pred = LAI->getPSE().getPredicate(); if (LAI->hasConvergentOp() && !Pred.isAlwaysTrue()) { return fail("RuntimeCheckWithConvergent", "may not insert runtime check with convergent operation"); } if (Pred.getComplexity() > (IsForced.value_or(false) ? PragmaDistributeSCEVCheckThreshold : DistributeSCEVCheckThreshold)) return fail("TooManySCEVRuntimeChecks", "too many SCEV run-time checks needed.\n"); if (!IsForced.value_or(false) && hasDisableAllTransformsHint(L)) return fail("HeuristicDisabled", "distribution heuristic disabled"); LLVM_DEBUG(dbgs() << "\nDistributing loop: " << *L << "\n"); // We're done forming the partitions set up the reverse mapping from // instructions to partitions. Partitions.setupPartitionIdOnInstructions(); // If we need run-time checks, version the loop now. auto PtrToPartition = Partitions.computePartitionSetForPointers(*LAI); const auto *RtPtrChecking = LAI->getRuntimePointerChecking(); const auto &AllChecks = RtPtrChecking->getChecks(); auto Checks = includeOnlyCrossPartitionChecks(AllChecks, PtrToPartition, RtPtrChecking); if (LAI->hasConvergentOp() && !Checks.empty()) { return fail("RuntimeCheckWithConvergent", "may not insert runtime check with convergent operation"); } // To keep things simple have an empty preheader before we version or clone // the loop. (Also split if this has no predecessor, i.e. entry, because we // rely on PH having a predecessor.) if (!PH->getSinglePredecessor() || &*PH->begin() != PH->getTerminator()) SplitBlock(PH, PH->getTerminator(), DT, LI); if (!Pred.isAlwaysTrue() || !Checks.empty()) { assert(!LAI->hasConvergentOp() && "inserting illegal loop versioning"); MDNode *OrigLoopID = L->getLoopID(); LLVM_DEBUG(dbgs() << "\nPointers:\n"); LLVM_DEBUG(LAI->getRuntimePointerChecking()->printChecks(dbgs(), Checks)); LoopVersioning LVer(*LAI, Checks, L, LI, DT, SE); LVer.versionLoop(DefsUsedOutside); LVer.annotateLoopWithNoAlias(); // The unversioned loop will not be changed, so we inherit all attributes // from the original loop, but remove the loop distribution metadata to // avoid to distribute it again. MDNode *UnversionedLoopID = makeFollowupLoopID(OrigLoopID, {LLVMLoopDistributeFollowupAll, LLVMLoopDistributeFollowupFallback}, "llvm.loop.distribute.", true) .value(); LVer.getNonVersionedLoop()->setLoopID(UnversionedLoopID); } // Create identical copies of the original loop for each partition and hook // them up sequentially. Partitions.cloneLoops(); // Now, we remove the instruction from each loop that don't belong to that // partition. Partitions.removeUnusedInsts(); LLVM_DEBUG(dbgs() << "\nAfter removing unused Instrs:\n"); LLVM_DEBUG(Partitions.printBlocks()); if (LDistVerify) { LI->verify(*DT); assert(DT->verify(DominatorTree::VerificationLevel::Fast)); } ++NumLoopsDistributed; // Report the success. ORE->emit([&]() { return OptimizationRemark(LDIST_NAME, "Distribute", L->getStartLoc(), L->getHeader()) << "distributed loop"; }); return true; } /// Provide diagnostics then \return with false. bool fail(StringRef RemarkName, StringRef Message) { LLVMContext &Ctx = F->getContext(); bool Forced = isForced().value_or(false); LLVM_DEBUG(dbgs() << "Skipping; " << Message << "\n"); // With Rpass-missed report that distribution failed. ORE->emit([&]() { return OptimizationRemarkMissed(LDIST_NAME, "NotDistributed", L->getStartLoc(), L->getHeader()) << "loop not distributed: use -Rpass-analysis=loop-distribute for " "more " "info"; }); // With Rpass-analysis report why. This is on by default if distribution // was requested explicitly. ORE->emit(OptimizationRemarkAnalysis( Forced ? OptimizationRemarkAnalysis::AlwaysPrint : LDIST_NAME, RemarkName, L->getStartLoc(), L->getHeader()) << "loop not distributed: " << Message); // Also issue a warning if distribution was requested explicitly but it // failed. if (Forced) Ctx.diagnose(DiagnosticInfoOptimizationFailure( *F, L->getStartLoc(), "loop not distributed: failed " "explicitly specified loop distribution")); return false; } /// Return if distribution forced to be enabled/disabled for the loop. /// /// If the optional has a value, it indicates whether distribution was forced /// to be enabled (true) or disabled (false). If the optional has no value /// distribution was not forced either way. const Optional &isForced() const { return IsForced; } private: /// Filter out checks between pointers from the same partition. /// /// \p PtrToPartition contains the partition number for pointers. Partition /// number -1 means that the pointer is used in multiple partitions. In this /// case we can't safely omit the check. SmallVector includeOnlyCrossPartitionChecks( const SmallVectorImpl &AllChecks, const SmallVectorImpl &PtrToPartition, const RuntimePointerChecking *RtPtrChecking) { SmallVector Checks; copy_if(AllChecks, std::back_inserter(Checks), [&](const RuntimePointerCheck &Check) { for (unsigned PtrIdx1 : Check.first->Members) for (unsigned PtrIdx2 : Check.second->Members) // Only include this check if there is a pair of pointers // that require checking and the pointers fall into // separate partitions. // // (Note that we already know at this point that the two // pointer groups need checking but it doesn't follow // that each pair of pointers within the two groups need // checking as well. // // In other words we don't want to include a check just // because there is a pair of pointers between the two // pointer groups that require checks and a different // pair whose pointers fall into different partitions.) if (RtPtrChecking->needsChecking(PtrIdx1, PtrIdx2) && !RuntimePointerChecking::arePointersInSamePartition( PtrToPartition, PtrIdx1, PtrIdx2)) return true; return false; }); return Checks; } /// Check whether the loop metadata is forcing distribution to be /// enabled/disabled. void setForced() { Optional Value = findStringMetadataForLoop(L, "llvm.loop.distribute.enable"); if (!Value) return; const MDOperand *Op = *Value; assert(Op && mdconst::hasa(*Op) && "invalid metadata"); IsForced = mdconst::extract(*Op)->getZExtValue(); } Loop *L; Function *F; // Analyses used. LoopInfo *LI; const LoopAccessInfo *LAI = nullptr; DominatorTree *DT; ScalarEvolution *SE; OptimizationRemarkEmitter *ORE; /// Indicates whether distribution is forced to be enabled/disabled for /// the loop. /// /// If the optional has a value, it indicates whether distribution was forced /// to be enabled (true) or disabled (false). If the optional has no value /// distribution was not forced either way. Optional IsForced; }; } // end anonymous namespace /// Shared implementation between new and old PMs. static bool runImpl(Function &F, LoopInfo *LI, DominatorTree *DT, ScalarEvolution *SE, OptimizationRemarkEmitter *ORE, std::function &GetLAA) { // Build up a worklist of inner-loops to vectorize. This is necessary as the // act of distributing a loop creates new loops and can invalidate iterators // across the loops. SmallVector Worklist; for (Loop *TopLevelLoop : *LI) for (Loop *L : depth_first(TopLevelLoop)) // We only handle inner-most loops. if (L->isInnermost()) Worklist.push_back(L); // Now walk the identified inner loops. bool Changed = false; for (Loop *L : Worklist) { LoopDistributeForLoop LDL(L, &F, LI, DT, SE, ORE); // If distribution was forced for the specific loop to be // enabled/disabled, follow that. Otherwise use the global flag. if (LDL.isForced().value_or(EnableLoopDistribute)) Changed |= LDL.processLoop(GetLAA); } // Process each loop nest in the function. return Changed; } namespace { /// The pass class. class LoopDistributeLegacy : public FunctionPass { public: static char ID; LoopDistributeLegacy() : FunctionPass(ID) { // The default is set by the caller. initializeLoopDistributeLegacyPass(*PassRegistry::getPassRegistry()); } bool runOnFunction(Function &F) override { if (skipFunction(F)) return false; auto *LI = &getAnalysis().getLoopInfo(); auto *LAA = &getAnalysis(); auto *DT = &getAnalysis().getDomTree(); auto *SE = &getAnalysis().getSE(); auto *ORE = &getAnalysis().getORE(); std::function GetLAA = [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; return runImpl(F, LI, DT, SE, ORE, GetLAA); } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.addRequired(); AU.addPreserved(); AU.addRequired(); AU.addRequired(); AU.addPreserved(); AU.addRequired(); AU.addPreserved(); } }; } // end anonymous namespace PreservedAnalyses LoopDistributePass::run(Function &F, FunctionAnalysisManager &AM) { auto &LI = AM.getResult(F); auto &DT = AM.getResult(F); auto &SE = AM.getResult(F); auto &ORE = AM.getResult(F); // We don't directly need these analyses but they're required for loop // analyses so provide them below. auto &AA = AM.getResult(F); auto &AC = AM.getResult(F); auto &TTI = AM.getResult(F); auto &TLI = AM.getResult(F); auto &LAM = AM.getResult(F).getManager(); std::function GetLAA = [&](Loop &L) -> const LoopAccessInfo & { LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, nullptr, nullptr, nullptr}; return LAM.getResult(L, AR); }; bool Changed = runImpl(F, &LI, &DT, &SE, &ORE, GetLAA); if (!Changed) return PreservedAnalyses::all(); PreservedAnalyses PA; PA.preserve(); PA.preserve(); return PA; } char LoopDistributeLegacy::ID; static const char ldist_name[] = "Loop Distribution"; INITIALIZE_PASS_BEGIN(LoopDistributeLegacy, LDIST_NAME, ldist_name, false, false) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) INITIALIZE_PASS_END(LoopDistributeLegacy, LDIST_NAME, ldist_name, false, false) FunctionPass *llvm::createLoopDistributePass() { return new LoopDistributeLegacy(); }