xref: /freebsd/contrib/llvm-project/llvm/lib/Transforms/Utils/CodeLayout.cpp (revision 3541d90836c0dde9734ea776f2b2b6c4ed8fd7f4)
1  //===- CodeLayout.cpp - Implementation of code layout algorithms ----------===//
2  //
3  // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  // See https://llvm.org/LICENSE.txt for license information.
5  // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  //
7  //===----------------------------------------------------------------------===//
8  //
9  // ExtTSP - layout of basic blocks with i-cache optimization.
10  //
11  // The algorithm tries to find a layout of nodes (basic blocks) of a given CFG
12  // optimizing jump locality and thus processor I-cache utilization. This is
13  // achieved via increasing the number of fall-through jumps and co-locating
14  // frequently executed nodes together. The name follows the underlying
15  // optimization problem, Extended-TSP, which is a generalization of the
16  // classical (maximum) Traveling Salesman Problem.
17  //
18  // The algorithm is a greedy heuristic that works with chains (ordered lists)
19  // of basic blocks. Initially all chains are isolated basic blocks. On every
20  // iteration, we pick a pair of chains whose merging yields the biggest increase
21  // in the ExtTSP score, which models how i-cache "friendly" a specific chain is.
22  // A pair of chains giving the maximum gain is merged into a new chain. The
23  // procedure stops when there is only one chain left, or when merging does not
24  // increase ExtTSP. In the latter case, the remaining chains are sorted by
25  // density in the decreasing order.
26  //
27  // An important aspect is the way two chains are merged. Unlike earlier
28  // algorithms (e.g., based on the approach of Pettis-Hansen), the first
29  // chain, X, is split into two parts, X1 and X2, giving three chains. Then we
30  // consider all possible ways of gluing the three chains (e.g., X1YX2, X1X2Y,
31  // X2X1Y, X2YX1, YX1X2, YX2X1) and choose the one producing the largest score.
32  // This improves the quality of the final result (the search space is larger)
33  // while keeping the implementation sufficiently fast.
34  //
35  // Reference:
36  //   * A. Newell and S. Pupyrev, Improved Basic Block Reordering,
37  //     IEEE Transactions on Computers, 2020
38  //     https://arxiv.org/abs/1809.04676
39  //
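// A minimal usage sketch of the exported interface (illustrative; it assumes
// EdgeT is the (source, destination) pair of node indices consumed by
// applyExtTspLayout at the bottom of this file):
//
//   std::vector<uint64_t> Sizes  = {16, 32, 16, 8};     // block sizes in bytes
//   std::vector<uint64_t> Counts = {100, 90, 10, 100};  // block execution counts
//   std::vector<std::pair<std::pair<uint64_t, uint64_t>, uint64_t>> Edges = {
//       {{0, 1}, 90}, {{0, 2}, 10}, {{1, 3}, 90}, {{2, 3}, 10}};
//   std::vector<uint64_t> Order = llvm::applyExtTspLayout(Sizes, Counts, Edges);
//   // Order is a permutation of {0, 1, 2, 3} with the entry node 0 kept first.
//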
40  //===----------------------------------------------------------------------===//
41  
42  #include "llvm/Transforms/Utils/CodeLayout.h"
43  #include "llvm/Support/CommandLine.h"
44  
45  #include <cmath>
46  
47  using namespace llvm;
48  #define DEBUG_TYPE "code-layout"
49  
50  cl::opt<bool> EnableExtTspBlockPlacement(
51      "enable-ext-tsp-block-placement", cl::Hidden, cl::init(false),
52      cl::desc("Enable machine block placement based on the ext-tsp model, "
53               "optimizing I-cache utilization."));
54  
55  cl::opt<bool> ApplyExtTspWithoutProfile(
56      "ext-tsp-apply-without-profile",
57      cl::desc("Whether to apply ext-tsp placement for instances w/o profile"),
58      cl::init(true), cl::Hidden);
59  
60  // Algorithm-specific params. The values are tuned for the best performance
61  // of large-scale front-end bound binaries.
62  static cl::opt<double> ForwardWeightCond(
63      "ext-tsp-forward-weight-cond", cl::ReallyHidden, cl::init(0.1),
64      cl::desc("The weight of conditional forward jumps for ExtTSP value"));
65  
66  static cl::opt<double> ForwardWeightUncond(
67      "ext-tsp-forward-weight-uncond", cl::ReallyHidden, cl::init(0.1),
68      cl::desc("The weight of unconditional forward jumps for ExtTSP value"));
69  
70  static cl::opt<double> BackwardWeightCond(
71      "ext-tsp-backward-weight-cond", cl::ReallyHidden, cl::init(0.1),
72      cl::desc("The weight of conditional backward jumps for ExtTSP value"));
73  
74  static cl::opt<double> BackwardWeightUncond(
75      "ext-tsp-backward-weight-uncond", cl::ReallyHidden, cl::init(0.1),
76      cl::desc("The weight of unconditional backward jumps for ExtTSP value"));
77  
78  static cl::opt<double> FallthroughWeightCond(
79      "ext-tsp-fallthrough-weight-cond", cl::ReallyHidden, cl::init(1.0),
80      cl::desc("The weight of conditional fallthrough jumps for ExtTSP value"));
81  
82  static cl::opt<double> FallthroughWeightUncond(
83      "ext-tsp-fallthrough-weight-uncond", cl::ReallyHidden, cl::init(1.05),
84      cl::desc("The weight of unconditional fallthrough jumps for ExtTSP value"));
85  
86  static cl::opt<unsigned> ForwardDistance(
87      "ext-tsp-forward-distance", cl::ReallyHidden, cl::init(1024),
88      cl::desc("The maximum distance (in bytes) of a forward jump for ExtTSP"));
89  
90  static cl::opt<unsigned> BackwardDistance(
91      "ext-tsp-backward-distance", cl::ReallyHidden, cl::init(640),
92      cl::desc("The maximum distance (in bytes) of a backward jump for ExtTSP"));
93  
94  // The maximum size of a chain created by the algorithm. The size is bounded
95  // so that the algorithm can efficiently process extremely large instances.
96  static cl::opt<unsigned>
97      MaxChainSize("ext-tsp-max-chain-size", cl::ReallyHidden, cl::init(4096),
98                   cl::desc("The maximum size of a chain to create."));
99  
100  // The maximum size of a chain for splitting. Larger values of the threshold
101  // may yield better quality at the cost of worse run-time.
102  static cl::opt<unsigned> ChainSplitThreshold(
103      "ext-tsp-chain-split-threshold", cl::ReallyHidden, cl::init(128),
104      cl::desc("The maximum size of a chain to apply splitting"));
105  
106  // The option enables splitting (large) chains along incoming and outgoing
107  // jumps. This typically results in better quality.
108  static cl::opt<bool> EnableChainSplitAlongJumps(
109      "ext-tsp-enable-chain-split-along-jumps", cl::ReallyHidden, cl::init(true),
110      cl::desc("Enable splitting chains along incoming and outgoing jumps"));
111  
112  namespace {
113  
114  // Epsilon for comparison of doubles.
115  constexpr double EPS = 1e-8;
116  
117  // Compute the Ext-TSP score for a given jump.
118  double jumpExtTSPScore(uint64_t JumpDist, uint64_t JumpMaxDist, uint64_t Count,
119                         double Weight) {
120    if (JumpDist > JumpMaxDist)
121      return 0;
122    double Prob = 1.0 - static_cast<double>(JumpDist) / JumpMaxDist;
123    return Weight * Prob * Count;
124  }
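
// For illustration: a jump with JumpDist = 256, JumpMaxDist = 1024, Count = 200
// and Weight = 0.1 gets Prob = 1.0 - 256.0 / 1024 = 0.75 and a score of
// 0.1 * 0.75 * 200 = 15.0, while a jump with JumpDist > JumpMaxDist adds 0.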
125  
126  // Compute the Ext-TSP score for a jump between a given pair of blocks,
127  // using their sizes, (estimated) addresses and the jump execution count.
128  double extTSPScore(uint64_t SrcAddr, uint64_t SrcSize, uint64_t DstAddr,
129                     uint64_t Count, bool IsConditional) {
130    // Fallthrough
131    if (SrcAddr + SrcSize == DstAddr) {
132      return jumpExtTSPScore(0, 1, Count,
133                             IsConditional ? FallthroughWeightCond
134                                           : FallthroughWeightUncond);
135    }
136    // Forward
137    if (SrcAddr + SrcSize < DstAddr) {
138      const uint64_t Dist = DstAddr - (SrcAddr + SrcSize);
139      return jumpExtTSPScore(Dist, ForwardDistance, Count,
140                             IsConditional ? ForwardWeightCond
141                                           : ForwardWeightUncond);
142    }
143    // Backward
144    const uint64_t Dist = SrcAddr + SrcSize - DstAddr;
145    return jumpExtTSPScore(Dist, BackwardDistance, Count,
146                           IsConditional ? BackwardWeightCond
147                                         : BackwardWeightUncond);
148  }
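
// For illustration, with the default weights and distances above and
// Count = 100: a conditional jump from SrcAddr = 0, SrcSize = 16 to
// DstAddr = 16 is a fallthrough and scores 1.0 * 1.0 * 100 = 100; a jump to
// DstAddr = 528 is a forward jump of distance 512 and scores
// 0.1 * (1 - 512.0 / 1024) * 100 = 5; a jump from SrcAddr = 1024, SrcSize = 16
// back to DstAddr = 0 has distance 1040 > BackwardDistance = 640 and scores 0.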
149  
150  /// A type of merging two chains, X and Y. The former chain is split into
151  /// X1 and X2 and then concatenated with Y in the order specified by the type.
152  enum class MergeTypeTy : int { X_Y, X1_Y_X2, Y_X2_X1, X2_X1_Y };
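
// For illustration: if chain X = [a, b, c] is split at offset 1 into X1 = [a]
// and X2 = [b, c], and Y = [d, e], the merge types above yield the orders
//   X_Y     -> a, b, c, d, e
//   X1_Y_X2 -> a, d, e, b, c
//   Y_X2_X1 -> d, e, b, c, a
//   X2_X1_Y -> b, c, a, d, e
// (see mergeBlocks below for the exact construction).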
153  
154  /// The gain of merging two chains, that is, the Ext-TSP score of the merge
155  /// together with the corresponding merge 'type' and 'offset'.
156  class MergeGainTy {
157  public:
158    explicit MergeGainTy() = default;
159    explicit MergeGainTy(double Score, size_t MergeOffset, MergeTypeTy MergeType)
160        : Score(Score), MergeOffset(MergeOffset), MergeType(MergeType) {}
161  
162    double score() const { return Score; }
163  
164    size_t mergeOffset() const { return MergeOffset; }
165  
166    MergeTypeTy mergeType() const { return MergeType; }
167  
168    // Returns 'true' iff Other is preferred over this.
169    bool operator<(const MergeGainTy &Other) const {
170      return (Other.Score > EPS && Other.Score > Score + EPS);
171    }
172  
173    // Update the current gain if Other is preferred over this.
174    void updateIfLessThan(const MergeGainTy &Other) {
175      if (*this < Other)
176        *this = Other;
177    }
178  
179  private:
180    double Score{-1.0};
181    size_t MergeOffset{0};
182    MergeTypeTy MergeType{MergeTypeTy::X_Y};
183  };
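
// For illustration: a default-constructed MergeGainTy carries Score = -1.0, so
// any candidate with a positive score is preferred over it, e.g.
//   MergeGainTy Best;
//   Best.updateIfLessThan(
//       MergeGainTy(2.5, /*MergeOffset=*/1, MergeTypeTy::X1_Y_X2));
//   // Best now holds the candidate, since 2.5 > EPS and 2.5 > -1.0 + EPS.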
184  
185  class Jump;
186  class Chain;
187  class ChainEdge;
188  
189  /// A node in the graph, typically corresponding to a basic block in CFG.
190  class Block {
191  public:
192    Block(const Block &) = delete;
193    Block(Block &&) = default;
194    Block &operator=(const Block &) = delete;
195    Block &operator=(Block &&) = default;
196  
197    // The original index of the block in CFG.
198    size_t Index{0};
199    // The index of the block in the current chain.
200    size_t CurIndex{0};
201    // Size of the block in the binary.
202    uint64_t Size{0};
203    // Execution count of the block in the profile data.
204    uint64_t ExecutionCount{0};
205    // Current chain of the node.
206    Chain *CurChain{nullptr};
207    // An offset of the block in the current chain.
208    mutable uint64_t EstimatedAddr{0};
209    // Forced successor of the block in CFG.
210    Block *ForcedSucc{nullptr};
211    // Forced predecessor of the block in CFG.
212    Block *ForcedPred{nullptr};
213    // Outgoing jumps from the block.
214    std::vector<Jump *> OutJumps;
215    // Incoming jumps to the block.
216    std::vector<Jump *> InJumps;
217  
218  public:
219    explicit Block(size_t Index, uint64_t Size, uint64_t EC)
220        : Index(Index), Size(Size), ExecutionCount(EC) {}
221    bool isEntry() const { return Index == 0; }
222  };
223  
224  /// An arc in the graph, typically corresponding to a jump between two blocks.
225  class Jump {
226  public:
227    Jump(const Jump &) = delete;
228    Jump(Jump &&) = default;
229    Jump &operator=(const Jump &) = delete;
230    Jump &operator=(Jump &&) = default;
231  
232    // Source block of the jump.
233    Block *Source;
234    // Target block of the jump.
235    Block *Target;
236    // Execution count of the arc in the profile data.
237    uint64_t ExecutionCount{0};
238    // Whether the jump corresponds to a conditional branch.
239    bool IsConditional{false};
240  
241  public:
242    explicit Jump(Block *Source, Block *Target, uint64_t ExecutionCount)
243        : Source(Source), Target(Target), ExecutionCount(ExecutionCount) {}
244  };
245  
246  /// A chain (ordered sequence) of blocks.
247  class Chain {
248  public:
249    Chain(const Chain &) = delete;
250    Chain(Chain &&) = default;
251    Chain &operator=(const Chain &) = delete;
252    Chain &operator=(Chain &&) = default;
253  
254    explicit Chain(uint64_t Id, Block *Block)
255        : Id(Id), Score(0), Blocks(1, Block) {}
256  
257    uint64_t id() const { return Id; }
258  
259    bool isEntry() const { return Blocks[0]->Index == 0; }
260  
261    bool isCold() const {
262      for (auto *Block : Blocks) {
263        if (Block->ExecutionCount > 0)
264          return false;
265      }
266      return true;
267    }
268  
269    double score() const { return Score; }
270  
271    void setScore(double NewScore) { Score = NewScore; }
272  
273    const std::vector<Block *> &blocks() const { return Blocks; }
274  
275    size_t numBlocks() const { return Blocks.size(); }
276  
277    const std::vector<std::pair<Chain *, ChainEdge *>> &edges() const {
278      return Edges;
279    }
280  
281    ChainEdge *getEdge(Chain *Other) const {
282      for (auto It : Edges) {
283        if (It.first == Other)
284          return It.second;
285      }
286      return nullptr;
287    }
288  
289    void removeEdge(Chain *Other) {
290      auto It = Edges.begin();
291      while (It != Edges.end()) {
292        if (It->first == Other) {
293          Edges.erase(It);
294          return;
295        }
296        It++;
297      }
298    }
299  
300    void addEdge(Chain *Other, ChainEdge *Edge) {
301      Edges.push_back(std::make_pair(Other, Edge));
302    }
303  
304    void merge(Chain *Other, const std::vector<Block *> &MergedBlocks) {
305      Blocks = MergedBlocks;
306      // Update the block's chains
307      for (size_t Idx = 0; Idx < Blocks.size(); Idx++) {
308        Blocks[Idx]->CurChain = this;
309        Blocks[Idx]->CurIndex = Idx;
310      }
311    }
312  
313    void mergeEdges(Chain *Other);
314  
315    void clear() {
316      Blocks.clear();
317      Blocks.shrink_to_fit();
318      Edges.clear();
319      Edges.shrink_to_fit();
320    }
321  
322  private:
323    // Unique chain identifier.
324    uint64_t Id;
325    // Cached ext-tsp score for the chain.
326    double Score;
327    // Blocks of the chain.
328    std::vector<Block *> Blocks;
329    // Adjacent chains and corresponding edges (lists of jumps).
330    std::vector<std::pair<Chain *, ChainEdge *>> Edges;
331  };
332  
333  /// An edge in CFG representing jumps between two chains.
334  /// When blocks are merged into chains, the edges are combined too so that
335  /// there is always at most one edge between a pair of chains.
336  class ChainEdge {
337  public:
338    ChainEdge(const ChainEdge &) = delete;
339    ChainEdge(ChainEdge &&) = default;
340    ChainEdge &operator=(const ChainEdge &) = delete;
341    ChainEdge &operator=(ChainEdge &&) = default;
342  
343    explicit ChainEdge(Jump *Jump)
344        : SrcChain(Jump->Source->CurChain), DstChain(Jump->Target->CurChain),
345          Jumps(1, Jump) {}
346  
347    const std::vector<Jump *> &jumps() const { return Jumps; }
348  
349    void changeEndpoint(Chain *From, Chain *To) {
350      if (From == SrcChain)
351        SrcChain = To;
352      if (From == DstChain)
353        DstChain = To;
354    }
355  
356    void appendJump(Jump *Jump) { Jumps.push_back(Jump); }
357  
358    void moveJumps(ChainEdge *Other) {
359      Jumps.insert(Jumps.end(), Other->Jumps.begin(), Other->Jumps.end());
360      Other->Jumps.clear();
361      Other->Jumps.shrink_to_fit();
362    }
363  
364    bool hasCachedMergeGain(Chain *Src, Chain *Dst) const {
365      return Src == SrcChain ? CacheValidForward : CacheValidBackward;
366    }
367  
368    MergeGainTy getCachedMergeGain(Chain *Src, Chain *Dst) const {
369      return Src == SrcChain ? CachedGainForward : CachedGainBackward;
370    }
371  
372    void setCachedMergeGain(Chain *Src, Chain *Dst, MergeGainTy MergeGain) {
373      if (Src == SrcChain) {
374        CachedGainForward = MergeGain;
375        CacheValidForward = true;
376      } else {
377        CachedGainBackward = MergeGain;
378        CacheValidBackward = true;
379      }
380    }
381  
382    void invalidateCache() {
383      CacheValidForward = false;
384      CacheValidBackward = false;
385    }
386  
387  private:
388    // Source chain.
389    Chain *SrcChain{nullptr};
390    // Destination chain.
391    Chain *DstChain{nullptr};
392    // Original jumps in the binary with corresponding execution counts.
393    std::vector<Jump *> Jumps;
394    // Cached ext-tsp value for merging the pair of chains.
395    // Since the gain of merging (Src, Dst) and (Dst, Src) might be different,
396    // we store both values here.
397    MergeGainTy CachedGainForward;
398    MergeGainTy CachedGainBackward;
399    // Whether the cached value must be recomputed.
400    bool CacheValidForward{false};
401    bool CacheValidBackward{false};
402  };
403  
404  void Chain::mergeEdges(Chain *Other) {
405    assert(this != Other && "cannot merge a chain with itself");
406  
407    // Update edges adjacent to chain Other
408    for (auto EdgeIt : Other->Edges) {
409      Chain *DstChain = EdgeIt.first;
410      ChainEdge *DstEdge = EdgeIt.second;
411      Chain *TargetChain = DstChain == Other ? this : DstChain;
412      ChainEdge *CurEdge = getEdge(TargetChain);
413      if (CurEdge == nullptr) {
414        DstEdge->changeEndpoint(Other, this);
415        this->addEdge(TargetChain, DstEdge);
416        if (DstChain != this && DstChain != Other) {
417          DstChain->addEdge(this, DstEdge);
418        }
419      } else {
420        CurEdge->moveJumps(DstEdge);
421      }
422      // Cleanup leftover edge
423      if (DstChain != Other) {
424        DstChain->removeEdge(Other);
425      }
426    }
427  }
428  
429  using BlockIter = std::vector<Block *>::const_iterator;
430  
431  /// A wrapper around three chains of blocks; it is used to avoid extra
432  /// instantiation of the vectors.
433  class MergedChain {
434  public:
435    MergedChain(BlockIter Begin1, BlockIter End1, BlockIter Begin2 = BlockIter(),
436                BlockIter End2 = BlockIter(), BlockIter Begin3 = BlockIter(),
437                BlockIter End3 = BlockIter())
438        : Begin1(Begin1), End1(End1), Begin2(Begin2), End2(End2), Begin3(Begin3),
439          End3(End3) {}
440  
441    template <typename F> void forEach(const F &Func) const {
442      for (auto It = Begin1; It != End1; It++)
443        Func(*It);
444      for (auto It = Begin2; It != End2; It++)
445        Func(*It);
446      for (auto It = Begin3; It != End3; It++)
447        Func(*It);
448    }
449  
450    std::vector<Block *> getBlocks() const {
451      std::vector<Block *> Result;
452      Result.reserve(std::distance(Begin1, End1) + std::distance(Begin2, End2) +
453                     std::distance(Begin3, End3));
454      Result.insert(Result.end(), Begin1, End1);
455      Result.insert(Result.end(), Begin2, End2);
456      Result.insert(Result.end(), Begin3, End3);
457      return Result;
458    }
459  
460    const Block *getFirstBlock() const { return *Begin1; }
461  
462  private:
463    BlockIter Begin1;
464    BlockIter End1;
465    BlockIter Begin2;
466    BlockIter End2;
467    BlockIter Begin3;
468    BlockIter End3;
469  };
470  
471  /// The implementation of the ExtTSP algorithm.
472  class ExtTSPImpl {
473    using EdgeT = std::pair<uint64_t, uint64_t>;
474    using EdgeCountMap = std::vector<std::pair<EdgeT, uint64_t>>;
475  
476  public:
477    ExtTSPImpl(size_t NumNodes, const std::vector<uint64_t> &NodeSizes,
478               const std::vector<uint64_t> &NodeCounts,
479               const EdgeCountMap &EdgeCounts)
480        : NumNodes(NumNodes) {
481      initialize(NodeSizes, NodeCounts, EdgeCounts);
482    }
483  
484    /// Run the algorithm and return an optimized ordering of blocks.
485    void run(std::vector<uint64_t> &Result) {
486      // Pass 1: Merge blocks with their mutually forced successors
487      mergeForcedPairs();
488  
489      // Pass 2: Merge pairs of chains while improving the ExtTSP objective
490      mergeChainPairs();
491  
492      // Pass 3: Merge cold blocks to reduce code size
493      mergeColdChains();
494  
495      // Collect blocks from all chains
496      concatChains(Result);
497    }
498  
499  private:
500    /// Initialize the algorithm's data structures.
501    void initialize(const std::vector<uint64_t> &NodeSizes,
502                    const std::vector<uint64_t> &NodeCounts,
503                    const EdgeCountMap &EdgeCounts) {
504      // Initialize blocks
505      AllBlocks.reserve(NumNodes);
506      for (uint64_t Node = 0; Node < NumNodes; Node++) {
507        uint64_t Size = std::max<uint64_t>(NodeSizes[Node], 1ULL);
508        uint64_t ExecutionCount = NodeCounts[Node];
509        // The execution count of the entry block is set to at least 1
510        if (Node == 0 && ExecutionCount == 0)
511          ExecutionCount = 1;
512        AllBlocks.emplace_back(Node, Size, ExecutionCount);
513      }
514  
515      // Initialize jumps between blocks
516      SuccNodes.resize(NumNodes);
517      PredNodes.resize(NumNodes);
518      std::vector<uint64_t> OutDegree(NumNodes, 0);
519      AllJumps.reserve(EdgeCounts.size());
520      for (auto It : EdgeCounts) {
521        auto Pred = It.first.first;
522        auto Succ = It.first.second;
523        OutDegree[Pred]++;
524        // Ignore self-edges
525        if (Pred == Succ)
526          continue;
527  
528        SuccNodes[Pred].push_back(Succ);
529        PredNodes[Succ].push_back(Pred);
530        auto ExecutionCount = It.second;
531        if (ExecutionCount > 0) {
532          auto &Block = AllBlocks[Pred];
533          auto &SuccBlock = AllBlocks[Succ];
534          AllJumps.emplace_back(&Block, &SuccBlock, ExecutionCount);
535          SuccBlock.InJumps.push_back(&AllJumps.back());
536          Block.OutJumps.push_back(&AllJumps.back());
537        }
538      }
539      for (auto &Jump : AllJumps) {
540        assert(OutDegree[Jump.Source->Index] > 0);
541        Jump.IsConditional = OutDegree[Jump.Source->Index] > 1;
542      }
543  
544      // Initialize chains
545      AllChains.reserve(NumNodes);
546      HotChains.reserve(NumNodes);
547      for (Block &Block : AllBlocks) {
548        AllChains.emplace_back(Block.Index, &Block);
549        Block.CurChain = &AllChains.back();
550        if (Block.ExecutionCount > 0) {
551          HotChains.push_back(&AllChains.back());
552        }
553      }
554  
555      // Initialize chain edges
556      AllEdges.reserve(AllJumps.size());
557      for (Block &Block : AllBlocks) {
558        for (auto &Jump : Block.OutJumps) {
559          auto SuccBlock = Jump->Target;
560          ChainEdge *CurEdge = Block.CurChain->getEdge(SuccBlock->CurChain);
561          // this edge is already present in the graph
562          if (CurEdge != nullptr) {
563            assert(SuccBlock->CurChain->getEdge(Block.CurChain) != nullptr);
564            CurEdge->appendJump(Jump);
565            continue;
566          }
567          // this is a new edge
568          AllEdges.emplace_back(Jump);
569          Block.CurChain->addEdge(SuccBlock->CurChain, &AllEdges.back());
570          SuccBlock->CurChain->addEdge(Block.CurChain, &AllEdges.back());
571        }
572      }
573    }
574  
575    /// For a pair of blocks, A and B, block B is the forced successor of A,
576    /// if (i) all jumps (based on profile) from A go to B and (ii) all jumps
577    /// to B are from A. Such blocks should be adjacent in the optimal ordering;
578    /// the method finds and merges such pairs of blocks.
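  ///
  /// For example, if the profile records A -> B = 100 as the only out-edge of
  /// A and the only in-edge of B (and B is not the entry block), then B is
  /// recorded as A's forced successor and the two blocks are merged into one
  /// chain, so the jump can become a fallthrough.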
579    void mergeForcedPairs() {
580      // Find fallthroughs based on edge weights
581      for (auto &Block : AllBlocks) {
582        if (SuccNodes[Block.Index].size() == 1 &&
583            PredNodes[SuccNodes[Block.Index][0]].size() == 1 &&
584            SuccNodes[Block.Index][0] != 0) {
585          size_t SuccIndex = SuccNodes[Block.Index][0];
586          Block.ForcedSucc = &AllBlocks[SuccIndex];
587          AllBlocks[SuccIndex].ForcedPred = &Block;
588        }
589      }
590  
591      // There might be 'cycles' in the forced dependencies, since profile
592      // data isn't 100% accurate. Typically this is observed in loops, when the
593      // loop edges are the hottest successors for the basic blocks of the loop.
594      // Break the cycles by choosing the block with the smallest index as the
595      // head. This helps to keep the original order of the loops, which likely
596      // have already been rotated in the optimized manner.
597      for (auto &Block : AllBlocks) {
598        if (Block.ForcedSucc == nullptr || Block.ForcedPred == nullptr)
599          continue;
600  
601        auto SuccBlock = Block.ForcedSucc;
602        while (SuccBlock != nullptr && SuccBlock != &Block) {
603          SuccBlock = SuccBlock->ForcedSucc;
604        }
605        if (SuccBlock == nullptr)
606          continue;
607        // Break the cycle
608        AllBlocks[Block.ForcedPred->Index].ForcedSucc = nullptr;
609        Block.ForcedPred = nullptr;
610      }
611  
612      // Merge blocks with their fallthrough successors
613      for (auto &Block : AllBlocks) {
614        if (Block.ForcedPred == nullptr && Block.ForcedSucc != nullptr) {
615          auto CurBlock = &Block;
616          while (CurBlock->ForcedSucc != nullptr) {
617            const auto NextBlock = CurBlock->ForcedSucc;
618            mergeChains(Block.CurChain, NextBlock->CurChain, 0, MergeTypeTy::X_Y);
619            CurBlock = NextBlock;
620          }
621        }
622      }
623    }
624  
625    /// Merge pairs of chains while improving the ExtTSP objective.
626    void mergeChainPairs() {
627      /// Deterministically compare pairs of chains
628      auto compareChainPairs = [](const Chain *A1, const Chain *B1,
629                                  const Chain *A2, const Chain *B2) {
630        if (A1 != A2)
631          return A1->id() < A2->id();
632        return B1->id() < B2->id();
633      };
634  
635      while (HotChains.size() > 1) {
636        Chain *BestChainPred = nullptr;
637        Chain *BestChainSucc = nullptr;
638        auto BestGain = MergeGainTy();
639        // Iterate over all pairs of chains
640        for (Chain *ChainPred : HotChains) {
641          // Get candidates for merging with the current chain
642          for (auto EdgeIter : ChainPred->edges()) {
643            Chain *ChainSucc = EdgeIter.first;
644            class ChainEdge *ChainEdge = EdgeIter.second;
645            // Ignore loop edges
646            if (ChainPred == ChainSucc)
647              continue;
648  
649            // Stop early if the combined chain violates the maximum allowed size
650            if (ChainPred->numBlocks() + ChainSucc->numBlocks() >= MaxChainSize)
651              continue;
652  
653            // Compute the gain of merging the two chains
654            MergeGainTy CurGain =
655                getBestMergeGain(ChainPred, ChainSucc, ChainEdge);
656            if (CurGain.score() <= EPS)
657              continue;
658  
659            if (BestGain < CurGain ||
660                (std::abs(CurGain.score() - BestGain.score()) < EPS &&
661                 compareChainPairs(ChainPred, ChainSucc, BestChainPred,
662                                   BestChainSucc))) {
663              BestGain = CurGain;
664              BestChainPred = ChainPred;
665              BestChainSucc = ChainSucc;
666            }
667          }
668        }
669  
670        // Stop merging when there is no improvement
671        if (BestGain.score() <= EPS)
672          break;
673  
674        // Merge the best pair of chains
675        mergeChains(BestChainPred, BestChainSucc, BestGain.mergeOffset(),
676                    BestGain.mergeType());
677      }
678    }
679  
680    /// Merge remaining blocks into chains w/o taking jump counts into
681    /// consideration. This helps maintain the original block order in the
682    /// absence of profile data.
683    void mergeColdChains() {
684      for (size_t SrcBB = 0; SrcBB < NumNodes; SrcBB++) {
685        // Iterating in reverse order to make sure original fallthrough jumps are
686        // merged first; this might be beneficial for code size.
687        size_t NumSuccs = SuccNodes[SrcBB].size();
688        for (size_t Idx = 0; Idx < NumSuccs; Idx++) {
689          auto DstBB = SuccNodes[SrcBB][NumSuccs - Idx - 1];
690          auto SrcChain = AllBlocks[SrcBB].CurChain;
691          auto DstChain = AllBlocks[DstBB].CurChain;
692          if (SrcChain != DstChain && !DstChain->isEntry() &&
693              SrcChain->blocks().back()->Index == SrcBB &&
694              DstChain->blocks().front()->Index == DstBB &&
695              SrcChain->isCold() == DstChain->isCold()) {
696            mergeChains(SrcChain, DstChain, 0, MergeTypeTy::X_Y);
697          }
698        }
699      }
700    }
701  
702    /// Compute the Ext-TSP score for a given block order and a list of jumps.
703    double extTSPScore(const MergedChain &MergedBlocks,
704                       const std::vector<Jump *> &Jumps) const {
705      if (Jumps.empty())
706        return 0.0;
707      uint64_t CurAddr = 0;
708      MergedBlocks.forEach([&](const Block *BB) {
709        BB->EstimatedAddr = CurAddr;
710        CurAddr += BB->Size;
711      });
712  
713      double Score = 0;
714      for (auto &Jump : Jumps) {
715        const Block *SrcBlock = Jump->Source;
716        const Block *DstBlock = Jump->Target;
717        Score += ::extTSPScore(SrcBlock->EstimatedAddr, SrcBlock->Size,
718                               DstBlock->EstimatedAddr, Jump->ExecutionCount,
719                               Jump->IsConditional);
720      }
721      return Score;
722    }
723  
724    /// Compute the gain of merging two chains.
725    ///
726    /// The function considers all possible ways of merging two chains and
727    /// computes the one having the largest increase in the ExtTSP objective.
728    /// The result holds the gain together with the corresponding merge offset
729    /// and merge type.
730    MergeGainTy getBestMergeGain(Chain *ChainPred, Chain *ChainSucc,
731                                 ChainEdge *Edge) const {
732      if (Edge->hasCachedMergeGain(ChainPred, ChainSucc)) {
733        return Edge->getCachedMergeGain(ChainPred, ChainSucc);
734      }
735  
736      // Precompute jumps between ChainPred and ChainSucc
737      auto Jumps = Edge->jumps();
738      ChainEdge *EdgePP = ChainPred->getEdge(ChainPred);
739      if (EdgePP != nullptr) {
740        Jumps.insert(Jumps.end(), EdgePP->jumps().begin(), EdgePP->jumps().end());
741      }
742      assert(!Jumps.empty() && "trying to merge chains w/o jumps");
743  
744      // The object holds the best currently chosen gain of merging the two chains
745      MergeGainTy Gain = MergeGainTy();
746  
747      /// Given a merge offset and a list of merge types, try to merge two chains
748      /// and update Gain with a better alternative
749      auto tryChainMerging = [&](size_t Offset,
750                                 const std::vector<MergeTypeTy> &MergeTypes) {
751        // Skip merging corresponding to concatenation w/o splitting
752        if (Offset == 0 || Offset == ChainPred->blocks().size())
753          return;
754        // Skip merging if it breaks Forced successors
755        auto BB = ChainPred->blocks()[Offset - 1];
756        if (BB->ForcedSucc != nullptr)
757          return;
758        // Apply the merge, compute the corresponding gain, and update the best
759        // value, if the merge is beneficial
760        for (const auto &MergeType : MergeTypes) {
761          Gain.updateIfLessThan(
762              computeMergeGain(ChainPred, ChainSucc, Jumps, Offset, MergeType));
763        }
764      };
765  
766      // Try to concatenate two chains w/o splitting
767      Gain.updateIfLessThan(
768          computeMergeGain(ChainPred, ChainSucc, Jumps, 0, MergeTypeTy::X_Y));
769  
770      if (EnableChainSplitAlongJumps) {
771        // Attach (a part of) ChainPred before the first block of ChainSucc
772        for (auto &Jump : ChainSucc->blocks().front()->InJumps) {
773          const auto SrcBlock = Jump->Source;
774          if (SrcBlock->CurChain != ChainPred)
775            continue;
776          size_t Offset = SrcBlock->CurIndex + 1;
777          tryChainMerging(Offset, {MergeTypeTy::X1_Y_X2, MergeTypeTy::X2_X1_Y});
778        }
779  
780        // Attach (a part of) ChainPred after the last block of ChainSucc
781        for (auto &Jump : ChainSucc->blocks().back()->OutJumps) {
782          const auto DstBlock = Jump->Target;
783          if (DstBlock->CurChain != ChainPred)
784            continue;
785          size_t Offset = DstBlock->CurIndex;
786          tryChainMerging(Offset, {MergeTypeTy::X1_Y_X2, MergeTypeTy::Y_X2_X1});
787        }
788      }
789  
790      // Try to break ChainPred in various ways and concatenate with ChainSucc
791      if (ChainPred->blocks().size() <= ChainSplitThreshold) {
792        for (size_t Offset = 1; Offset < ChainPred->blocks().size(); Offset++) {
793          // Try to split the chain in different ways. In practice, applying
794          // X2_Y_X1 merging almost never provides benefits; thus, we exclude
795          // it from consideration to reduce the search space
796          tryChainMerging(Offset, {MergeTypeTy::X1_Y_X2, MergeTypeTy::Y_X2_X1,
797                                   MergeTypeTy::X2_X1_Y});
798        }
799      }
800      Edge->setCachedMergeGain(ChainPred, ChainSucc, Gain);
801      return Gain;
802    }
803  
804    /// Compute the score gain of merging two chains, respecting a given
805    /// merge 'type' and 'offset'.
806    ///
807    /// The two chains are not modified in the method.
808    MergeGainTy computeMergeGain(const Chain *ChainPred, const Chain *ChainSucc,
809                                 const std::vector<Jump *> &Jumps,
810                                 size_t MergeOffset,
811                                 MergeTypeTy MergeType) const {
812      auto MergedBlocks = mergeBlocks(ChainPred->blocks(), ChainSucc->blocks(),
813                                      MergeOffset, MergeType);
814  
815      // Do not allow a merge that does not preserve the original entry block
816      if ((ChainPred->isEntry() || ChainSucc->isEntry()) &&
817          !MergedBlocks.getFirstBlock()->isEntry())
818        return MergeGainTy();
819  
820      // The gain for the new chain
821      auto NewGainScore = extTSPScore(MergedBlocks, Jumps) - ChainPred->score();
822      return MergeGainTy(NewGainScore, MergeOffset, MergeType);
823    }
824  
825    /// Merge two chains of blocks respecting a given merge 'type' and 'offset'.
826    ///
827    /// If MergeType is X_Y, the result is a concatenation of the two chains.
828    /// Otherwise, the first chain is cut into two sub-chains at the offset,
829    /// and the three pieces are concatenated in the order given by MergeType.
830    MergedChain mergeBlocks(const std::vector<Block *> &X,
831                            const std::vector<Block *> &Y, size_t MergeOffset,
832                            MergeTypeTy MergeType) const {
833      // Split the first chain, X, into X1 and X2
834      BlockIter BeginX1 = X.begin();
835      BlockIter EndX1 = X.begin() + MergeOffset;
836      BlockIter BeginX2 = X.begin() + MergeOffset;
837      BlockIter EndX2 = X.end();
838      BlockIter BeginY = Y.begin();
839      BlockIter EndY = Y.end();
840  
841      // Construct a new chain from the three existing ones
842      switch (MergeType) {
843      case MergeTypeTy::X_Y:
844        return MergedChain(BeginX1, EndX2, BeginY, EndY);
845      case MergeTypeTy::X1_Y_X2:
846        return MergedChain(BeginX1, EndX1, BeginY, EndY, BeginX2, EndX2);
847      case MergeTypeTy::Y_X2_X1:
848        return MergedChain(BeginY, EndY, BeginX2, EndX2, BeginX1, EndX1);
849      case MergeTypeTy::X2_X1_Y:
850        return MergedChain(BeginX2, EndX2, BeginX1, EndX1, BeginY, EndY);
851      }
852      llvm_unreachable("unexpected chain merge type");
853    }
854  
855    /// Merge chain From into chain Into, update the list of active chains,
856    /// adjacency information, and the corresponding cached values.
857    void mergeChains(Chain *Into, Chain *From, size_t MergeOffset,
858                     MergeTypeTy MergeType) {
859      assert(Into != From && "a chain cannot be merged with itself");
860  
861      // Merge the blocks
862      MergedChain MergedBlocks =
863          mergeBlocks(Into->blocks(), From->blocks(), MergeOffset, MergeType);
864      Into->merge(From, MergedBlocks.getBlocks());
865      Into->mergeEdges(From);
866      From->clear();
867  
868      // Update cached ext-tsp score for the new chain
869      ChainEdge *SelfEdge = Into->getEdge(Into);
870      if (SelfEdge != nullptr) {
871        MergedBlocks = MergedChain(Into->blocks().begin(), Into->blocks().end());
872        Into->setScore(extTSPScore(MergedBlocks, SelfEdge->jumps()));
873      }
874  
875      // Remove chain From from the list of active chains
876      llvm::erase_value(HotChains, From);
877  
878      // Invalidate caches
879      for (auto EdgeIter : Into->edges()) {
880        EdgeIter.second->invalidateCache();
881      }
882    }
883  
884    /// Concatenate all chains into a final order of blocks.
885    void concatChains(std::vector<uint64_t> &Order) {
886      // Collect chains and calculate some stats for their sorting
887      std::vector<Chain *> SortedChains;
888      DenseMap<const Chain *, double> ChainDensity;
889      for (auto &Chain : AllChains) {
890        if (!Chain.blocks().empty()) {
891          SortedChains.push_back(&Chain);
892          // Using doubles to avoid overflow of ExecutionCount
893          double Size = 0;
894          double ExecutionCount = 0;
895          for (auto *Block : Chain.blocks()) {
896            Size += static_cast<double>(Block->Size);
897            ExecutionCount += static_cast<double>(Block->ExecutionCount);
898          }
899          assert(Size > 0 && "a chain of zero size");
900          ChainDensity[&Chain] = ExecutionCount / Size;
901        }
902      }
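
    // For illustration: a chain with blocks of sizes 16 and 32 bytes and
    // execution counts 100 and 50 gets density (100 + 50) / (16 + 32) = 3.125.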
903  
904      // Sorting chains by density in decreasing order
905      std::stable_sort(SortedChains.begin(), SortedChains.end(),
906                       [&](const Chain *C1, const Chain *C2) {
907                         // Make sure the original entry block is at the
908                         // beginning of the order
909                         if (C1->isEntry() != C2->isEntry()) {
910                           return C1->isEntry();
911                         }
912  
913                         const double D1 = ChainDensity[C1];
914                         const double D2 = ChainDensity[C2];
915                         // Compare by density and break ties by chain identifiers
916                         return (D1 != D2) ? (D1 > D2) : (C1->id() < C2->id());
917                       });
918  
919      // Collect the blocks in the order specified by their chains
920      Order.reserve(NumNodes);
921      for (Chain *Chain : SortedChains) {
922        for (Block *Block : Chain->blocks()) {
923          Order.push_back(Block->Index);
924        }
925      }
926    }
927  
928  private:
929    /// The number of nodes in the graph.
930    const size_t NumNodes;
931  
932    /// Successors of each node.
933    std::vector<std::vector<uint64_t>> SuccNodes;
934  
935    /// Predecessors of each node.
936    std::vector<std::vector<uint64_t>> PredNodes;
937  
938    /// All basic blocks.
939    std::vector<Block> AllBlocks;
940  
941    /// All jumps between blocks.
942    std::vector<Jump> AllJumps;
943  
944    /// All chains of basic blocks.
945    std::vector<Chain> AllChains;
946  
947    /// All edges between chains.
948    std::vector<ChainEdge> AllEdges;
949  
950    /// Active chains. The vector gets updated at runtime when chains are merged.
951    std::vector<Chain *> HotChains;
952  };
953  
954  } // end of anonymous namespace
955  
956  std::vector<uint64_t> llvm::applyExtTspLayout(
957      const std::vector<uint64_t> &NodeSizes,
958      const std::vector<uint64_t> &NodeCounts,
959      const std::vector<std::pair<EdgeT, uint64_t>> &EdgeCounts) {
960    size_t NumNodes = NodeSizes.size();
961  
962    // Verify correctness of the input data.
963    assert(NodeCounts.size() == NodeSizes.size() && "Incorrect input");
964    assert(NumNodes > 2 && "Incorrect input");
965  
966    // Apply the reordering algorithm.
967    auto Alg = ExtTSPImpl(NumNodes, NodeSizes, NodeCounts, EdgeCounts);
968    std::vector<uint64_t> Result;
969    Alg.run(Result);
970  
971    // Verify correctness of the output.
972    assert(Result.front() == 0 && "Original entry point is not preserved");
973    assert(Result.size() == NumNodes && "Incorrect size of reordered layout");
974    return Result;
975  }
976  
977  double llvm::calcExtTspScore(
978      const std::vector<uint64_t> &Order, const std::vector<uint64_t> &NodeSizes,
979      const std::vector<uint64_t> &NodeCounts,
980      const std::vector<std::pair<EdgeT, uint64_t>> &EdgeCounts) {
981    // Estimate addresses of the blocks in memory
982    std::vector<uint64_t> Addr(NodeSizes.size(), 0);
983    for (size_t Idx = 1; Idx < Order.size(); Idx++) {
984      Addr[Order[Idx]] = Addr[Order[Idx - 1]] + NodeSizes[Order[Idx - 1]];
985    }
986    std::vector<uint64_t> OutDegree(NodeSizes.size(), 0);
987    for (auto It : EdgeCounts) {
988      auto Pred = It.first.first;
989      OutDegree[Pred]++;
990    }
991  
992    // Increase the score for each jump
993    double Score = 0;
994    for (auto It : EdgeCounts) {
995      auto Pred = It.first.first;
996      auto Succ = It.first.second;
997      uint64_t Count = It.second;
998      bool IsConditional = OutDegree[Pred] > 1;
999      Score += ::extTSPScore(Addr[Pred], NodeSizes[Pred], Addr[Succ], Count,
1000                             IsConditional);
1001    }
1002    return Score;
1003  }
1004  
1005  double llvm::calcExtTspScore(
1006      const std::vector<uint64_t> &NodeSizes,
1007      const std::vector<uint64_t> &NodeCounts,
1008      const std::vector<std::pair<EdgeT, uint64_t>> &EdgeCounts) {
1009    std::vector<uint64_t> Order(NodeSizes.size());
1010    for (size_t Idx = 0; Idx < NodeSizes.size(); Idx++) {
1011      Order[Idx] = Idx;
1012    }
1013    return calcExtTspScore(Order, NodeSizes, NodeCounts, EdgeCounts);
1014  }
1015