xref: /freebsd/contrib/llvm-project/llvm/lib/Transforms/Utils/CodeLayout.cpp (revision 640242a5915761ce63205bdb0542fa3c1473c0ff)
1  //===- CodeLayout.cpp - Implementation of code layout algorithms ----------===//
2  //
3  // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  // See https://llvm.org/LICENSE.txt for license information.
5  // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  //
7  //===----------------------------------------------------------------------===//
8  //
9  // ExtTSP - layout of basic blocks with i-cache optimization.
10  //
11  // The algorithm tries to find a layout of nodes (basic blocks) of a given CFG
12  // optimizing jump locality and thus processor I-cache utilization. This is
13  // achieved via increasing the number of fall-through jumps and co-locating
14  // frequently executed nodes together. The name follows the underlying
15  // optimization problem, Extended-TSP, which is a generalization of classical
16  // (maximum) Traveling Salesmen Problem.
17  //
18  // The algorithm is a greedy heuristic that works with chains (ordered lists)
19  // of basic blocks. Initially all chains are isolated basic blocks. On every
20  // iteration, we pick a pair of chains whose merging yields the biggest increase
21  // in the ExtTSP score, which models how i-cache "friendly" a specific chain is.
22  // A pair of chains giving the maximum gain is merged into a new chain. The
23  // procedure stops when there is only one chain left, or when merging does not
24  // increase ExtTSP. In the latter case, the remaining chains are sorted by
25  // density in the decreasing order.
26  //
27  // An important aspect is the way two chains are merged. Unlike earlier
28  // algorithms (e.g., based on the approach of Pettis-Hansen), two
29  // chains, X and Y, are first split into three, X1, X2, and Y. Then we
30  // consider all possible ways of gluing the three chains (e.g., X1YX2, X1X2Y,
31  // X2X1Y, X2YX1, YX1X2, YX2X1) and choose the one producing the largest score.
32  // This improves the quality of the final result (the search space is larger)
33  // while keeping the implementation sufficiently fast.
34  //
35  // Reference:
36  //   * A. Newell and S. Pupyrev, Improved Basic Block Reordering,
37  //     IEEE Transactions on Computers, 2020
38  //
39  //===----------------------------------------------------------------------===//
40  
41  #include "llvm/Transforms/Utils/CodeLayout.h"
42  #include "llvm/Support/CommandLine.h"
43  #include "llvm/Support/Debug.h"
44  
45  using namespace llvm;
46  #define DEBUG_TYPE "code-layout"
47  
48  // Algorithm-specific constants. The values are tuned for the best performance
49  // of large-scale front-end bound binaries.
50  static cl::opt<double>
51      ForwardWeight("ext-tsp-forward-weight", cl::Hidden, cl::init(0.1),
52                    cl::desc("The weight of forward jumps for ExtTSP value"));
53  
54  static cl::opt<double>
55      BackwardWeight("ext-tsp-backward-weight", cl::Hidden, cl::init(0.1),
56                     cl::desc("The weight of backward jumps for ExtTSP value"));
57  
58  static cl::opt<unsigned> ForwardDistance(
59      "ext-tsp-forward-distance", cl::Hidden, cl::init(1024),
60      cl::desc("The maximum distance (in bytes) of a forward jump for ExtTSP"));
61  
62  static cl::opt<unsigned> BackwardDistance(
63      "ext-tsp-backward-distance", cl::Hidden, cl::init(640),
64      cl::desc("The maximum distance (in bytes) of a backward jump for ExtTSP"));
65  
66  // The maximum size of a chain for splitting. Larger values of the threshold
67  // may yield better quality at the cost of worsen run-time.
68  static cl::opt<unsigned> ChainSplitThreshold(
69      "ext-tsp-chain-split-threshold", cl::Hidden, cl::init(128),
70      cl::desc("The maximum size of a chain to apply splitting"));
71  
72  // The option enables splitting (large) chains along in-coming and out-going
73  // jumps. This typically results in a better quality.
74  static cl::opt<bool> EnableChainSplitAlongJumps(
75      "ext-tsp-enable-chain-split-along-jumps", cl::Hidden, cl::init(true),
76      cl::desc("The maximum size of a chain to apply splitting"));
77  
78  namespace {
79  
80  // Epsilon for comparison of doubles.
81  constexpr double EPS = 1e-8;
82  
83  // Compute the Ext-TSP score for a jump between a given pair of blocks,
84  // using their sizes, (estimated) addresses and the jump execution count.
85  double extTSPScore(uint64_t SrcAddr, uint64_t SrcSize, uint64_t DstAddr,
86                     uint64_t Count) {
87    // Fallthrough
88    if (SrcAddr + SrcSize == DstAddr) {
89      // Assume that FallthroughWeight = 1.0 after normalization
90      return static_cast<double>(Count);
91    }
92    // Forward
93    if (SrcAddr + SrcSize < DstAddr) {
94      const auto Dist = DstAddr - (SrcAddr + SrcSize);
95      if (Dist <= ForwardDistance) {
96        double Prob = 1.0 - static_cast<double>(Dist) / ForwardDistance;
97        return ForwardWeight * Prob * Count;
98      }
99      return 0;
100    }
101    // Backward
102    const auto Dist = SrcAddr + SrcSize - DstAddr;
103    if (Dist <= BackwardDistance) {
104      double Prob = 1.0 - static_cast<double>(Dist) / BackwardDistance;
105      return BackwardWeight * Prob * Count;
106    }
107    return 0;
108  }
109  
110  /// A type of merging two chains, X and Y. The former chain is split into
111  /// X1 and X2 and then concatenated with Y in the order specified by the type.
112  enum class MergeTypeTy : int { X_Y, X1_Y_X2, Y_X2_X1, X2_X1_Y };
113  
114  /// The gain of merging two chains, that is, the Ext-TSP score of the merge
115  /// together with the corresponfiding merge 'type' and 'offset'.
116  class MergeGainTy {
117  public:
118    explicit MergeGainTy() {}
119    explicit MergeGainTy(double Score, size_t MergeOffset, MergeTypeTy MergeType)
120        : Score(Score), MergeOffset(MergeOffset), MergeType(MergeType) {}
121  
122    double score() const { return Score; }
123  
124    size_t mergeOffset() const { return MergeOffset; }
125  
126    MergeTypeTy mergeType() const { return MergeType; }
127  
128    // Returns 'true' iff Other is preferred over this.
129    bool operator<(const MergeGainTy &Other) const {
130      return (Other.Score > EPS && Other.Score > Score + EPS);
131    }
132  
133    // Update the current gain if Other is preferred over this.
134    void updateIfLessThan(const MergeGainTy &Other) {
135      if (*this < Other)
136        *this = Other;
137    }
138  
139  private:
140    double Score{-1.0};
141    size_t MergeOffset{0};
142    MergeTypeTy MergeType{MergeTypeTy::X_Y};
143  };
144  
145  class Block;
146  class Jump;
147  class Chain;
148  class ChainEdge;
149  
150  /// A node in the graph, typically corresponding to a basic block in CFG.
151  class Block {
152  public:
153    Block(const Block &) = delete;
154    Block(Block &&) = default;
155    Block &operator=(const Block &) = delete;
156    Block &operator=(Block &&) = default;
157  
158    // The original index of the block in CFG.
159    size_t Index{0};
160    // The index of the block in the current chain.
161    size_t CurIndex{0};
162    // Size of the block in the binary.
163    uint64_t Size{0};
164    // Execution count of the block in the profile data.
165    uint64_t ExecutionCount{0};
166    // Current chain of the node.
167    Chain *CurChain{nullptr};
168    // An offset of the block in the current chain.
169    mutable uint64_t EstimatedAddr{0};
170    // Forced successor of the block in CFG.
171    Block *ForcedSucc{nullptr};
172    // Forced predecessor of the block in CFG.
173    Block *ForcedPred{nullptr};
174    // Outgoing jumps from the block.
175    std::vector<Jump *> OutJumps;
176    // Incoming jumps to the block.
177    std::vector<Jump *> InJumps;
178  
179  public:
180    explicit Block(size_t Index, uint64_t Size_, uint64_t EC)
181        : Index(Index), Size(Size_), ExecutionCount(EC) {}
182    bool isEntry() const { return Index == 0; }
183  };
184  
185  /// An arc in the graph, typically corresponding to a jump between two blocks.
186  class Jump {
187  public:
188    Jump(const Jump &) = delete;
189    Jump(Jump &&) = default;
190    Jump &operator=(const Jump &) = delete;
191    Jump &operator=(Jump &&) = default;
192  
193    // Source block of the jump.
194    Block *Source;
195    // Target block of the jump.
196    Block *Target;
197    // Execution count of the arc in the profile data.
198    uint64_t ExecutionCount{0};
199  
200  public:
201    explicit Jump(Block *Source, Block *Target, uint64_t ExecutionCount)
202        : Source(Source), Target(Target), ExecutionCount(ExecutionCount) {}
203  };
204  
205  /// A chain (ordered sequence) of blocks.
206  class Chain {
207  public:
208    Chain(const Chain &) = delete;
209    Chain(Chain &&) = default;
210    Chain &operator=(const Chain &) = delete;
211    Chain &operator=(Chain &&) = default;
212  
213    explicit Chain(uint64_t Id, Block *Block)
214        : Id(Id), Score(0), Blocks(1, Block) {}
215  
216    uint64_t id() const { return Id; }
217  
218    bool isEntry() const { return Blocks[0]->Index == 0; }
219  
220    double score() const { return Score; }
221  
222    void setScore(double NewScore) { Score = NewScore; }
223  
224    const std::vector<Block *> &blocks() const { return Blocks; }
225  
226    const std::vector<std::pair<Chain *, ChainEdge *>> &edges() const {
227      return Edges;
228    }
229  
230    ChainEdge *getEdge(Chain *Other) const {
231      for (auto It : Edges) {
232        if (It.first == Other)
233          return It.second;
234      }
235      return nullptr;
236    }
237  
238    void removeEdge(Chain *Other) {
239      auto It = Edges.begin();
240      while (It != Edges.end()) {
241        if (It->first == Other) {
242          Edges.erase(It);
243          return;
244        }
245        It++;
246      }
247    }
248  
249    void addEdge(Chain *Other, ChainEdge *Edge) {
250      Edges.push_back(std::make_pair(Other, Edge));
251    }
252  
253    void merge(Chain *Other, const std::vector<Block *> &MergedBlocks) {
254      Blocks = MergedBlocks;
255      // Update the block's chains
256      for (size_t Idx = 0; Idx < Blocks.size(); Idx++) {
257        Blocks[Idx]->CurChain = this;
258        Blocks[Idx]->CurIndex = Idx;
259      }
260    }
261  
262    void mergeEdges(Chain *Other);
263  
264    void clear() {
265      Blocks.clear();
266      Blocks.shrink_to_fit();
267      Edges.clear();
268      Edges.shrink_to_fit();
269    }
270  
271  private:
272    // Unique chain identifier.
273    uint64_t Id;
274    // Cached ext-tsp score for the chain.
275    double Score;
276    // Blocks of the chain.
277    std::vector<Block *> Blocks;
278    // Adjacent chains and corresponding edges (lists of jumps).
279    std::vector<std::pair<Chain *, ChainEdge *>> Edges;
280  };
281  
282  /// An edge in CFG representing jumps between two chains.
283  /// When blocks are merged into chains, the edges are combined too so that
284  /// there is always at most one edge between a pair of chains
285  class ChainEdge {
286  public:
287    ChainEdge(const ChainEdge &) = delete;
288    ChainEdge(ChainEdge &&) = default;
289    ChainEdge &operator=(const ChainEdge &) = delete;
290    ChainEdge &operator=(ChainEdge &&) = default;
291  
292    explicit ChainEdge(Jump *Jump)
293        : SrcChain(Jump->Source->CurChain), DstChain(Jump->Target->CurChain),
294          Jumps(1, Jump) {}
295  
296    const std::vector<Jump *> &jumps() const { return Jumps; }
297  
298    void changeEndpoint(Chain *From, Chain *To) {
299      if (From == SrcChain)
300        SrcChain = To;
301      if (From == DstChain)
302        DstChain = To;
303    }
304  
305    void appendJump(Jump *Jump) { Jumps.push_back(Jump); }
306  
307    void moveJumps(ChainEdge *Other) {
308      Jumps.insert(Jumps.end(), Other->Jumps.begin(), Other->Jumps.end());
309      Other->Jumps.clear();
310      Other->Jumps.shrink_to_fit();
311    }
312  
313    bool hasCachedMergeGain(Chain *Src, Chain *Dst) const {
314      return Src == SrcChain ? CacheValidForward : CacheValidBackward;
315    }
316  
317    MergeGainTy getCachedMergeGain(Chain *Src, Chain *Dst) const {
318      return Src == SrcChain ? CachedGainForward : CachedGainBackward;
319    }
320  
321    void setCachedMergeGain(Chain *Src, Chain *Dst, MergeGainTy MergeGain) {
322      if (Src == SrcChain) {
323        CachedGainForward = MergeGain;
324        CacheValidForward = true;
325      } else {
326        CachedGainBackward = MergeGain;
327        CacheValidBackward = true;
328      }
329    }
330  
331    void invalidateCache() {
332      CacheValidForward = false;
333      CacheValidBackward = false;
334    }
335  
336  private:
337    // Source chain.
338    Chain *SrcChain{nullptr};
339    // Destination chain.
340    Chain *DstChain{nullptr};
341    // Original jumps in the binary with correspinding execution counts.
342    std::vector<Jump *> Jumps;
343    // Cached ext-tsp value for merging the pair of chains.
344    // Since the gain of merging (Src, Dst) and (Dst, Src) might be different,
345    // we store both values here.
346    MergeGainTy CachedGainForward;
347    MergeGainTy CachedGainBackward;
348    // Whether the cached value must be recomputed.
349    bool CacheValidForward{false};
350    bool CacheValidBackward{false};
351  };
352  
353  void Chain::mergeEdges(Chain *Other) {
354    assert(this != Other && "cannot merge a chain with itself");
355  
356    // Update edges adjacent to chain Other
357    for (auto EdgeIt : Other->Edges) {
358      const auto DstChain = EdgeIt.first;
359      const auto DstEdge = EdgeIt.second;
360      const auto TargetChain = DstChain == Other ? this : DstChain;
361      auto CurEdge = getEdge(TargetChain);
362      if (CurEdge == nullptr) {
363        DstEdge->changeEndpoint(Other, this);
364        this->addEdge(TargetChain, DstEdge);
365        if (DstChain != this && DstChain != Other) {
366          DstChain->addEdge(this, DstEdge);
367        }
368      } else {
369        CurEdge->moveJumps(DstEdge);
370      }
371      // Cleanup leftover edge
372      if (DstChain != Other) {
373        DstChain->removeEdge(Other);
374      }
375    }
376  }
377  
378  using BlockIter = std::vector<Block *>::const_iterator;
379  
380  /// A wrapper around three chains of blocks; it is used to avoid extra
381  /// instantiation of the vectors.
382  class MergedChain {
383  public:
384    MergedChain(BlockIter Begin1, BlockIter End1, BlockIter Begin2 = BlockIter(),
385                BlockIter End2 = BlockIter(), BlockIter Begin3 = BlockIter(),
386                BlockIter End3 = BlockIter())
387        : Begin1(Begin1), End1(End1), Begin2(Begin2), End2(End2), Begin3(Begin3),
388          End3(End3) {}
389  
390    template <typename F> void forEach(const F &Func) const {
391      for (auto It = Begin1; It != End1; It++)
392        Func(*It);
393      for (auto It = Begin2; It != End2; It++)
394        Func(*It);
395      for (auto It = Begin3; It != End3; It++)
396        Func(*It);
397    }
398  
399    std::vector<Block *> getBlocks() const {
400      std::vector<Block *> Result;
401      Result.reserve(std::distance(Begin1, End1) + std::distance(Begin2, End2) +
402                     std::distance(Begin3, End3));
403      Result.insert(Result.end(), Begin1, End1);
404      Result.insert(Result.end(), Begin2, End2);
405      Result.insert(Result.end(), Begin3, End3);
406      return Result;
407    }
408  
409    const Block *getFirstBlock() const { return *Begin1; }
410  
411  private:
412    BlockIter Begin1;
413    BlockIter End1;
414    BlockIter Begin2;
415    BlockIter End2;
416    BlockIter Begin3;
417    BlockIter End3;
418  };
419  
420  /// The implementation of the ExtTSP algorithm.
421  class ExtTSPImpl {
422    using EdgeT = std::pair<uint64_t, uint64_t>;
423    using EdgeCountMap = DenseMap<EdgeT, uint64_t>;
424  
425  public:
426    ExtTSPImpl(size_t NumNodes, const std::vector<uint64_t> &NodeSizes,
427               const std::vector<uint64_t> &NodeCounts,
428               const EdgeCountMap &EdgeCounts)
429        : NumNodes(NumNodes) {
430      initialize(NodeSizes, NodeCounts, EdgeCounts);
431    }
432  
433    /// Run the algorithm and return an optimized ordering of blocks.
434    void run(std::vector<uint64_t> &Result) {
435      // Pass 1: Merge blocks with their mutually forced successors
436      mergeForcedPairs();
437  
438      // Pass 2: Merge pairs of chains while improving the ExtTSP objective
439      mergeChainPairs();
440  
441      // Pass 3: Merge cold blocks to reduce code size
442      mergeColdChains();
443  
444      // Collect blocks from all chains
445      concatChains(Result);
446    }
447  
448  private:
449    /// Initialize the algorithm's data structures.
450    void initialize(const std::vector<uint64_t> &NodeSizes,
451                    const std::vector<uint64_t> &NodeCounts,
452                    const EdgeCountMap &EdgeCounts) {
453      // Initialize blocks
454      AllBlocks.reserve(NumNodes);
455      for (uint64_t Node = 0; Node < NumNodes; Node++) {
456        uint64_t Size = std::max<uint64_t>(NodeSizes[Node], 1ULL);
457        uint64_t ExecutionCount = NodeCounts[Node];
458        // The execution count of the entry block is set to at least 1
459        if (Node == 0 && ExecutionCount == 0)
460          ExecutionCount = 1;
461        AllBlocks.emplace_back(Node, Size, ExecutionCount);
462      }
463  
464      // Initialize jumps between blocks
465      SuccNodes = std::vector<std::vector<uint64_t>>(NumNodes);
466      PredNodes = std::vector<std::vector<uint64_t>>(NumNodes);
467      AllJumps.reserve(EdgeCounts.size());
468      for (auto It : EdgeCounts) {
469        auto Pred = It.first.first;
470        auto Succ = It.first.second;
471        // Ignore self-edges
472        if (Pred == Succ)
473          continue;
474  
475        SuccNodes[Pred].push_back(Succ);
476        PredNodes[Succ].push_back(Pred);
477        auto ExecutionCount = It.second;
478        if (ExecutionCount > 0) {
479          auto &Block = AllBlocks[Pred];
480          auto &SuccBlock = AllBlocks[Succ];
481          AllJumps.emplace_back(&Block, &SuccBlock, ExecutionCount);
482          SuccBlock.InJumps.push_back(&AllJumps.back());
483          Block.OutJumps.push_back(&AllJumps.back());
484        }
485      }
486  
487      // Initialize chains
488      AllChains.reserve(NumNodes);
489      HotChains.reserve(NumNodes);
490      for (auto &Block : AllBlocks) {
491        AllChains.emplace_back(Block.Index, &Block);
492        Block.CurChain = &AllChains.back();
493        if (Block.ExecutionCount > 0) {
494          HotChains.push_back(&AllChains.back());
495        }
496      }
497  
498      // Initialize chain edges
499      AllEdges.reserve(AllJumps.size());
500      for (auto &Block : AllBlocks) {
501        for (auto &Jump : Block.OutJumps) {
502          const auto SuccBlock = Jump->Target;
503          auto CurEdge = Block.CurChain->getEdge(SuccBlock->CurChain);
504          // this edge is already present in the graph
505          if (CurEdge != nullptr) {
506            assert(SuccBlock->CurChain->getEdge(Block.CurChain) != nullptr);
507            CurEdge->appendJump(Jump);
508            continue;
509          }
510          // this is a new edge
511          AllEdges.emplace_back(Jump);
512          Block.CurChain->addEdge(SuccBlock->CurChain, &AllEdges.back());
513          SuccBlock->CurChain->addEdge(Block.CurChain, &AllEdges.back());
514        }
515      }
516    }
517  
518    /// For a pair of blocks, A and B, block B is the forced successor of A,
519    /// if (i) all jumps (based on profile) from A goes to B and (ii) all jumps
520    /// to B are from A. Such blocks should be adjacent in the optimal ordering;
521    /// the method finds and merges such pairs of blocks.
522    void mergeForcedPairs() {
523      // Find fallthroughs based on edge weights
524      for (auto &Block : AllBlocks) {
525        if (SuccNodes[Block.Index].size() == 1 &&
526            PredNodes[SuccNodes[Block.Index][0]].size() == 1 &&
527            SuccNodes[Block.Index][0] != 0) {
528          size_t SuccIndex = SuccNodes[Block.Index][0];
529          Block.ForcedSucc = &AllBlocks[SuccIndex];
530          AllBlocks[SuccIndex].ForcedPred = &Block;
531        }
532      }
533  
534      // There might be 'cycles' in the forced dependencies, since profile
535      // data isn't 100% accurate. Typically this is observed in loops, when the
536      // loop edges are the hottest successors for the basic blocks of the loop.
537      // Break the cycles by choosing the block with the smallest index as the
538      // head. This helps to keep the original order of the loops, which likely
539      // have already been rotated in the optimized manner.
540      for (auto &Block : AllBlocks) {
541        if (Block.ForcedSucc == nullptr || Block.ForcedPred == nullptr)
542          continue;
543  
544        auto SuccBlock = Block.ForcedSucc;
545        while (SuccBlock != nullptr && SuccBlock != &Block) {
546          SuccBlock = SuccBlock->ForcedSucc;
547        }
548        if (SuccBlock == nullptr)
549          continue;
550        // Break the cycle
551        AllBlocks[Block.ForcedPred->Index].ForcedSucc = nullptr;
552        Block.ForcedPred = nullptr;
553      }
554  
555      // Merge blocks with their fallthrough successors
556      for (auto &Block : AllBlocks) {
557        if (Block.ForcedPred == nullptr && Block.ForcedSucc != nullptr) {
558          auto CurBlock = &Block;
559          while (CurBlock->ForcedSucc != nullptr) {
560            const auto NextBlock = CurBlock->ForcedSucc;
561            mergeChains(Block.CurChain, NextBlock->CurChain, 0, MergeTypeTy::X_Y);
562            CurBlock = NextBlock;
563          }
564        }
565      }
566    }
567  
568    /// Merge pairs of chains while improving the ExtTSP objective.
569    void mergeChainPairs() {
570      /// Deterministically compare pairs of chains
571      auto compareChainPairs = [](const Chain *A1, const Chain *B1,
572                                  const Chain *A2, const Chain *B2) {
573        if (A1 != A2)
574          return A1->id() < A2->id();
575        return B1->id() < B2->id();
576      };
577  
578      while (HotChains.size() > 1) {
579        Chain *BestChainPred = nullptr;
580        Chain *BestChainSucc = nullptr;
581        auto BestGain = MergeGainTy();
582        // Iterate over all pairs of chains
583        for (auto ChainPred : HotChains) {
584          // Get candidates for merging with the current chain
585          for (auto EdgeIter : ChainPred->edges()) {
586            auto ChainSucc = EdgeIter.first;
587            auto ChainEdge = EdgeIter.second;
588            // Ignore loop edges
589            if (ChainPred == ChainSucc)
590              continue;
591  
592            // Compute the gain of merging the two chains
593            auto CurGain = getBestMergeGain(ChainPred, ChainSucc, ChainEdge);
594            if (CurGain.score() <= EPS)
595              continue;
596  
597            if (BestGain < CurGain ||
598                (std::abs(CurGain.score() - BestGain.score()) < EPS &&
599                 compareChainPairs(ChainPred, ChainSucc, BestChainPred,
600                                   BestChainSucc))) {
601              BestGain = CurGain;
602              BestChainPred = ChainPred;
603              BestChainSucc = ChainSucc;
604            }
605          }
606        }
607  
608        // Stop merging when there is no improvement
609        if (BestGain.score() <= EPS)
610          break;
611  
612        // Merge the best pair of chains
613        mergeChains(BestChainPred, BestChainSucc, BestGain.mergeOffset(),
614                    BestGain.mergeType());
615      }
616    }
617  
618    /// Merge cold blocks to reduce code size.
619    void mergeColdChains() {
620      for (size_t SrcBB = 0; SrcBB < NumNodes; SrcBB++) {
621        // Iterating over neighbors in the reverse order to make sure original
622        // fallthrough jumps are merged first
623        size_t NumSuccs = SuccNodes[SrcBB].size();
624        for (size_t Idx = 0; Idx < NumSuccs; Idx++) {
625          auto DstBB = SuccNodes[SrcBB][NumSuccs - Idx - 1];
626          auto SrcChain = AllBlocks[SrcBB].CurChain;
627          auto DstChain = AllBlocks[DstBB].CurChain;
628          if (SrcChain != DstChain && !DstChain->isEntry() &&
629              SrcChain->blocks().back()->Index == SrcBB &&
630              DstChain->blocks().front()->Index == DstBB) {
631            mergeChains(SrcChain, DstChain, 0, MergeTypeTy::X_Y);
632          }
633        }
634      }
635    }
636  
637    /// Compute the Ext-TSP score for a given block order and a list of jumps.
638    double extTSPScore(const MergedChain &MergedBlocks,
639                       const std::vector<Jump *> &Jumps) const {
640      if (Jumps.empty())
641        return 0.0;
642      uint64_t CurAddr = 0;
643      MergedBlocks.forEach([&](const Block *BB) {
644        BB->EstimatedAddr = CurAddr;
645        CurAddr += BB->Size;
646      });
647  
648      double Score = 0;
649      for (auto &Jump : Jumps) {
650        const auto SrcBlock = Jump->Source;
651        const auto DstBlock = Jump->Target;
652        Score += ::extTSPScore(SrcBlock->EstimatedAddr, SrcBlock->Size,
653                               DstBlock->EstimatedAddr, Jump->ExecutionCount);
654      }
655      return Score;
656    }
657  
658    /// Compute the gain of merging two chains.
659    ///
660    /// The function considers all possible ways of merging two chains and
661    /// computes the one having the largest increase in ExtTSP objective. The
662    /// result is a pair with the first element being the gain and the second
663    /// element being the corresponding merging type.
664    MergeGainTy getBestMergeGain(Chain *ChainPred, Chain *ChainSucc,
665                                 ChainEdge *Edge) const {
666      if (Edge->hasCachedMergeGain(ChainPred, ChainSucc)) {
667        return Edge->getCachedMergeGain(ChainPred, ChainSucc);
668      }
669  
670      // Precompute jumps between ChainPred and ChainSucc
671      auto Jumps = Edge->jumps();
672      auto EdgePP = ChainPred->getEdge(ChainPred);
673      if (EdgePP != nullptr) {
674        Jumps.insert(Jumps.end(), EdgePP->jumps().begin(), EdgePP->jumps().end());
675      }
676      assert(!Jumps.empty() && "trying to merge chains w/o jumps");
677  
678      // The object holds the best currently chosen gain of merging the two chains
679      MergeGainTy Gain = MergeGainTy();
680  
681      /// Given a merge offset and a list of merge types, try to merge two chains
682      /// and update Gain with a better alternative
683      auto tryChainMerging = [&](size_t Offset,
684                                 const std::vector<MergeTypeTy> &MergeTypes) {
685        // Skip merging corresponding to concatenation w/o splitting
686        if (Offset == 0 || Offset == ChainPred->blocks().size())
687          return;
688        // Skip merging if it breaks Forced successors
689        auto BB = ChainPred->blocks()[Offset - 1];
690        if (BB->ForcedSucc != nullptr)
691          return;
692        // Apply the merge, compute the corresponding gain, and update the best
693        // value, if the merge is beneficial
694        for (auto &MergeType : MergeTypes) {
695          Gain.updateIfLessThan(
696              computeMergeGain(ChainPred, ChainSucc, Jumps, Offset, MergeType));
697        }
698      };
699  
700      // Try to concatenate two chains w/o splitting
701      Gain.updateIfLessThan(
702          computeMergeGain(ChainPred, ChainSucc, Jumps, 0, MergeTypeTy::X_Y));
703  
704      if (EnableChainSplitAlongJumps) {
705        // Attach (a part of) ChainPred before the first block of ChainSucc
706        for (auto &Jump : ChainSucc->blocks().front()->InJumps) {
707          const auto SrcBlock = Jump->Source;
708          if (SrcBlock->CurChain != ChainPred)
709            continue;
710          size_t Offset = SrcBlock->CurIndex + 1;
711          tryChainMerging(Offset, {MergeTypeTy::X1_Y_X2, MergeTypeTy::X2_X1_Y});
712        }
713  
714        // Attach (a part of) ChainPred after the last block of ChainSucc
715        for (auto &Jump : ChainSucc->blocks().back()->OutJumps) {
716          const auto DstBlock = Jump->Source;
717          if (DstBlock->CurChain != ChainPred)
718            continue;
719          size_t Offset = DstBlock->CurIndex;
720          tryChainMerging(Offset, {MergeTypeTy::X1_Y_X2, MergeTypeTy::Y_X2_X1});
721        }
722      }
723  
724      // Try to break ChainPred in various ways and concatenate with ChainSucc
725      if (ChainPred->blocks().size() <= ChainSplitThreshold) {
726        for (size_t Offset = 1; Offset < ChainPred->blocks().size(); Offset++) {
727          // Try to split the chain in different ways. In practice, applying
728          // X2_Y_X1 merging is almost never provides benefits; thus, we exclude
729          // it from consideration to reduce the search space
730          tryChainMerging(Offset, {MergeTypeTy::X1_Y_X2, MergeTypeTy::Y_X2_X1,
731                                   MergeTypeTy::X2_X1_Y});
732        }
733      }
734      Edge->setCachedMergeGain(ChainPred, ChainSucc, Gain);
735      return Gain;
736    }
737  
738    /// Compute the score gain of merging two chains, respecting a given
739    /// merge 'type' and 'offset'.
740    ///
741    /// The two chains are not modified in the method.
742    MergeGainTy computeMergeGain(const Chain *ChainPred, const Chain *ChainSucc,
743                                 const std::vector<Jump *> &Jumps,
744                                 size_t MergeOffset,
745                                 MergeTypeTy MergeType) const {
746      auto MergedBlocks = mergeBlocks(ChainPred->blocks(), ChainSucc->blocks(),
747                                      MergeOffset, MergeType);
748  
749      // Do not allow a merge that does not preserve the original entry block
750      if ((ChainPred->isEntry() || ChainSucc->isEntry()) &&
751          !MergedBlocks.getFirstBlock()->isEntry())
752        return MergeGainTy();
753  
754      // The gain for the new chain
755      auto NewGainScore = extTSPScore(MergedBlocks, Jumps) - ChainPred->score();
756      return MergeGainTy(NewGainScore, MergeOffset, MergeType);
757    }
758  
759    /// Merge two chains of blocks respecting a given merge 'type' and 'offset'.
760    ///
761    /// If MergeType == 0, then the result is a concatentation of two chains.
762    /// Otherwise, the first chain is cut into two sub-chains at the offset,
763    /// and merged using all possible ways of concatenating three chains.
764    MergedChain mergeBlocks(const std::vector<Block *> &X,
765                            const std::vector<Block *> &Y, size_t MergeOffset,
766                            MergeTypeTy MergeType) const {
767      // Split the first chain, X, into X1 and X2
768      BlockIter BeginX1 = X.begin();
769      BlockIter EndX1 = X.begin() + MergeOffset;
770      BlockIter BeginX2 = X.begin() + MergeOffset;
771      BlockIter EndX2 = X.end();
772      BlockIter BeginY = Y.begin();
773      BlockIter EndY = Y.end();
774  
775      // Construct a new chain from the three existing ones
776      switch (MergeType) {
777      case MergeTypeTy::X_Y:
778        return MergedChain(BeginX1, EndX2, BeginY, EndY);
779      case MergeTypeTy::X1_Y_X2:
780        return MergedChain(BeginX1, EndX1, BeginY, EndY, BeginX2, EndX2);
781      case MergeTypeTy::Y_X2_X1:
782        return MergedChain(BeginY, EndY, BeginX2, EndX2, BeginX1, EndX1);
783      case MergeTypeTy::X2_X1_Y:
784        return MergedChain(BeginX2, EndX2, BeginX1, EndX1, BeginY, EndY);
785      }
786      llvm_unreachable("unexpected chain merge type");
787    }
788  
789    /// Merge chain From into chain Into, update the list of active chains,
790    /// adjacency information, and the corresponding cached values.
791    void mergeChains(Chain *Into, Chain *From, size_t MergeOffset,
792                     MergeTypeTy MergeType) {
793      assert(Into != From && "a chain cannot be merged with itself");
794  
795      // Merge the blocks
796      auto MergedBlocks =
797          mergeBlocks(Into->blocks(), From->blocks(), MergeOffset, MergeType);
798      Into->merge(From, MergedBlocks.getBlocks());
799      Into->mergeEdges(From);
800      From->clear();
801  
802      // Update cached ext-tsp score for the new chain
803      auto SelfEdge = Into->getEdge(Into);
804      if (SelfEdge != nullptr) {
805        MergedBlocks = MergedChain(Into->blocks().begin(), Into->blocks().end());
806        Into->setScore(extTSPScore(MergedBlocks, SelfEdge->jumps()));
807      }
808  
809      // Remove chain From from the list of active chains
810      auto Iter = std::remove(HotChains.begin(), HotChains.end(), From);
811      HotChains.erase(Iter, HotChains.end());
812  
813      // Invalidate caches
814      for (auto EdgeIter : Into->edges()) {
815        EdgeIter.second->invalidateCache();
816      }
817    }
818  
819    /// Concatenate all chains into a final order of blocks.
820    void concatChains(std::vector<uint64_t> &Order) {
821      // Collect chains and calculate some stats for their sorting
822      std::vector<Chain *> SortedChains;
823      DenseMap<const Chain *, double> ChainDensity;
824      for (auto &Chain : AllChains) {
825        if (!Chain.blocks().empty()) {
826          SortedChains.push_back(&Chain);
827          // Using doubles to avoid overflow of ExecutionCount
828          double Size = 0;
829          double ExecutionCount = 0;
830          for (auto Block : Chain.blocks()) {
831            Size += static_cast<double>(Block->Size);
832            ExecutionCount += static_cast<double>(Block->ExecutionCount);
833          }
834          assert(Size > 0 && "a chain of zero size");
835          ChainDensity[&Chain] = ExecutionCount / Size;
836        }
837      }
838  
839      // Sorting chains by density in the decreasing order
840      std::stable_sort(SortedChains.begin(), SortedChains.end(),
841                       [&](const Chain *C1, const Chain *C2) {
842                         // Makre sure the original entry block is at the
843                         // beginning of the order
844                         if (C1->isEntry() != C2->isEntry()) {
845                           return C1->isEntry();
846                         }
847  
848                         const double D1 = ChainDensity[C1];
849                         const double D2 = ChainDensity[C2];
850                         // Compare by density and break ties by chain identifiers
851                         return (D1 != D2) ? (D1 > D2) : (C1->id() < C2->id());
852                       });
853  
854      // Collect the blocks in the order specified by their chains
855      Order.reserve(NumNodes);
856      for (auto Chain : SortedChains) {
857        for (auto Block : Chain->blocks()) {
858          Order.push_back(Block->Index);
859        }
860      }
861    }
862  
863  private:
864    /// The number of nodes in the graph.
865    const size_t NumNodes;
866  
867    /// Successors of each node.
868    std::vector<std::vector<uint64_t>> SuccNodes;
869  
870    /// Predecessors of each node.
871    std::vector<std::vector<uint64_t>> PredNodes;
872  
873    /// All basic blocks.
874    std::vector<Block> AllBlocks;
875  
876    /// All jumps between blocks.
877    std::vector<Jump> AllJumps;
878  
879    /// All chains of basic blocks.
880    std::vector<Chain> AllChains;
881  
882    /// All edges between chains.
883    std::vector<ChainEdge> AllEdges;
884  
885    /// Active chains. The vector gets updated at runtime when chains are merged.
886    std::vector<Chain *> HotChains;
887  };
888  
889  } // end of anonymous namespace
890  
891  std::vector<uint64_t> llvm::applyExtTspLayout(
892      const std::vector<uint64_t> &NodeSizes,
893      const std::vector<uint64_t> &NodeCounts,
894      const DenseMap<std::pair<uint64_t, uint64_t>, uint64_t> &EdgeCounts) {
895    size_t NumNodes = NodeSizes.size();
896  
897    // Verify correctness of the input data.
898    assert(NodeCounts.size() == NodeSizes.size() && "Incorrect input");
899    assert(NumNodes > 2 && "Incorrect input");
900  
901    // Apply the reordering algorithm.
902    auto Alg = ExtTSPImpl(NumNodes, NodeSizes, NodeCounts, EdgeCounts);
903    std::vector<uint64_t> Result;
904    Alg.run(Result);
905  
906    // Verify correctness of the output.
907    assert(Result.front() == 0 && "Original entry point is not preserved");
908    assert(Result.size() == NumNodes && "Incorrect size of reordered layout");
909    return Result;
910  }
911  
912  double llvm::calcExtTspScore(
913      const std::vector<uint64_t> &Order, const std::vector<uint64_t> &NodeSizes,
914      const std::vector<uint64_t> &NodeCounts,
915      const DenseMap<std::pair<uint64_t, uint64_t>, uint64_t> &EdgeCounts) {
916    // Estimate addresses of the blocks in memory
917    auto Addr = std::vector<uint64_t>(NodeSizes.size(), 0);
918    for (size_t Idx = 1; Idx < Order.size(); Idx++) {
919      Addr[Order[Idx]] = Addr[Order[Idx - 1]] + NodeSizes[Order[Idx - 1]];
920    }
921  
922    // Increase the score for each jump
923    double Score = 0;
924    for (auto It : EdgeCounts) {
925      auto Pred = It.first.first;
926      auto Succ = It.first.second;
927      uint64_t Count = It.second;
928      Score += extTSPScore(Addr[Pred], NodeSizes[Pred], Addr[Succ], Count);
929    }
930    return Score;
931  }
932  
933  double llvm::calcExtTspScore(
934      const std::vector<uint64_t> &NodeSizes,
935      const std::vector<uint64_t> &NodeCounts,
936      const DenseMap<std::pair<uint64_t, uint64_t>, uint64_t> &EdgeCounts) {
937    auto Order = std::vector<uint64_t>(NodeSizes.size());
938    for (size_t Idx = 0; Idx < NodeSizes.size(); Idx++) {
939      Order[Idx] = Idx;
940    }
941    return calcExtTspScore(Order, NodeSizes, NodeCounts, EdgeCounts);
942  }
943