10b57cec5SDimitry Andric //===- CallGraphSort.cpp --------------------------------------------------===// 20b57cec5SDimitry Andric // 30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information. 50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 60b57cec5SDimitry Andric // 70b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 80b57cec5SDimitry Andric /// 9*5f757f3fSDimitry Andric /// The file is responsible for sorting sections using LLVM call graph profile 10*5f757f3fSDimitry Andric /// data by placing frequently executed code sections together. The goal of the 11*5f757f3fSDimitry Andric /// placement is to improve the runtime performance of the final executable by 12*5f757f3fSDimitry Andric /// arranging code sections so that i-TLB misses and i-cache misses are reduced. 13*5f757f3fSDimitry Andric /// 14*5f757f3fSDimitry Andric /// The algorithm first builds a call graph based on the profile data and then 15*5f757f3fSDimitry Andric /// iteratively merges "chains" (ordered lists) of input sections which will be 16*5f757f3fSDimitry Andric /// laid out as a unit. There are two implementations for deciding how to 17*5f757f3fSDimitry Andric /// merge a pair of chains: 18*5f757f3fSDimitry Andric /// - a simpler one, referred to as Call-Chain Clustering (C^3), that follows 19*5f757f3fSDimitry Andric /// "Optimizing Function Placement for Large-Scale Data-Center Applications" 200b57cec5SDimitry Andric /// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf 21*5f757f3fSDimitry Andric /// - a more advanced one, referred to as Cache-Directed-Sort (CDSort), which 22*5f757f3fSDimitry Andric /// typically produces layouts with higher locality, and hence, yields fewer 23*5f757f3fSDimitry Andric /// instruction cache misses on large binaries. 240b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 250b57cec5SDimitry Andric 260b57cec5SDimitry Andric #include "CallGraphSort.h" 2781ad6265SDimitry Andric #include "InputFiles.h" 2881ad6265SDimitry Andric #include "InputSection.h" 290b57cec5SDimitry Andric #include "Symbols.h" 3081ad6265SDimitry Andric #include "llvm/Support/FileSystem.h" 31*5f757f3fSDimitry Andric #include "llvm/Transforms/Utils/CodeLayout.h" 320b57cec5SDimitry Andric 3385868e8aSDimitry Andric #include <numeric> 3485868e8aSDimitry Andric 350b57cec5SDimitry Andric using namespace llvm; 365ffd83dbSDimitry Andric using namespace lld; 375ffd83dbSDimitry Andric using namespace lld::elf; 380b57cec5SDimitry Andric 390b57cec5SDimitry Andric namespace { 400b57cec5SDimitry Andric struct Edge { 410b57cec5SDimitry Andric int from; 420b57cec5SDimitry Andric uint64_t weight; 430b57cec5SDimitry Andric }; 440b57cec5SDimitry Andric 450b57cec5SDimitry Andric struct Cluster { 4685868e8aSDimitry Andric Cluster(int sec, size_t s) : next(sec), prev(sec), size(s) {} 470b57cec5SDimitry Andric 480b57cec5SDimitry Andric double getDensity() const { 490b57cec5SDimitry Andric if (size == 0) 500b57cec5SDimitry Andric return 0; 510b57cec5SDimitry Andric return double(weight) / double(size); 520b57cec5SDimitry Andric } 530b57cec5SDimitry Andric 5485868e8aSDimitry Andric int next; 5585868e8aSDimitry Andric int prev; 56e8d8bef9SDimitry Andric uint64_t size; 570b57cec5SDimitry Andric uint64_t weight = 0; 580b57cec5SDimitry Andric uint64_t initialWeight = 0; 590b57cec5SDimitry Andric Edge bestPred = {-1, 0}; 600b57cec5SDimitry Andric }; 610b57cec5SDimitry Andric 62*5f757f3fSDimitry Andric /// Implementation of the Call-Chain Clustering (C^3). The goal of this 63*5f757f3fSDimitry Andric /// algorithm is to improve runtime performance of the executable by arranging 64*5f757f3fSDimitry Andric /// code sections such that page table and i-cache misses are minimized. 65*5f757f3fSDimitry Andric /// 66*5f757f3fSDimitry Andric /// Definitions: 67*5f757f3fSDimitry Andric /// * Cluster 68*5f757f3fSDimitry Andric /// * An ordered list of input sections which are laid out as a unit. At the 69*5f757f3fSDimitry Andric /// beginning of the algorithm each input section has its own cluster and 70*5f757f3fSDimitry Andric /// the weight of the cluster is the sum of the weight of all incoming 71*5f757f3fSDimitry Andric /// edges. 72*5f757f3fSDimitry Andric /// * Call-Chain Clustering (C³) Heuristic 73*5f757f3fSDimitry Andric /// * Defines when and how clusters are combined. Pick the highest weighted 74*5f757f3fSDimitry Andric /// input section then add it to its most likely predecessor if it wouldn't 75*5f757f3fSDimitry Andric /// penalize it too much. 76*5f757f3fSDimitry Andric /// * Density 77*5f757f3fSDimitry Andric /// * The weight of the cluster divided by the size of the cluster. This is a 78*5f757f3fSDimitry Andric /// proxy for the amount of execution time spent per byte of the cluster. 79*5f757f3fSDimitry Andric /// 80*5f757f3fSDimitry Andric /// It does so given a call graph profile by the following: 81*5f757f3fSDimitry Andric /// * Build a weighted call graph from the call graph profile 82*5f757f3fSDimitry Andric /// * Sort input sections by weight 83*5f757f3fSDimitry Andric /// * For each input section starting with the highest weight 84*5f757f3fSDimitry Andric /// * Find its most likely predecessor cluster 85*5f757f3fSDimitry Andric /// * Check if the combined cluster would be too large, or would have too low 86*5f757f3fSDimitry Andric /// a density. 87*5f757f3fSDimitry Andric /// * If not, then combine the clusters. 88*5f757f3fSDimitry Andric /// * Sort non-empty clusters by density 890b57cec5SDimitry Andric class CallGraphSort { 900b57cec5SDimitry Andric public: 910b57cec5SDimitry Andric CallGraphSort(); 920b57cec5SDimitry Andric 930b57cec5SDimitry Andric DenseMap<const InputSectionBase *, int> run(); 940b57cec5SDimitry Andric 950b57cec5SDimitry Andric private: 960b57cec5SDimitry Andric std::vector<Cluster> clusters; 970b57cec5SDimitry Andric std::vector<const InputSectionBase *> sections; 980b57cec5SDimitry Andric }; 990b57cec5SDimitry Andric 100480093f4SDimitry Andric // Maximum amount the combined cluster density can be worse than the original 1010b57cec5SDimitry Andric // cluster to consider merging. 1020b57cec5SDimitry Andric constexpr int MAX_DENSITY_DEGRADATION = 8; 1030b57cec5SDimitry Andric 1040b57cec5SDimitry Andric // Maximum cluster size in bytes. 1050b57cec5SDimitry Andric constexpr uint64_t MAX_CLUSTER_SIZE = 1024 * 1024; 1060b57cec5SDimitry Andric } // end anonymous namespace 1070b57cec5SDimitry Andric 1080b57cec5SDimitry Andric using SectionPair = 1090b57cec5SDimitry Andric std::pair<const InputSectionBase *, const InputSectionBase *>; 1100b57cec5SDimitry Andric 1110b57cec5SDimitry Andric // Take the edge list in Config->CallGraphProfile, resolve symbol names to 1120b57cec5SDimitry Andric // Symbols, and generate a graph between InputSections with the provided 1130b57cec5SDimitry Andric // weights. 1140b57cec5SDimitry Andric CallGraphSort::CallGraphSort() { 1150b57cec5SDimitry Andric MapVector<SectionPair, uint64_t> &profile = config->callGraphProfile; 1160b57cec5SDimitry Andric DenseMap<const InputSectionBase *, int> secToCluster; 1170b57cec5SDimitry Andric 1180b57cec5SDimitry Andric auto getOrCreateNode = [&](const InputSectionBase *isec) -> int { 11985868e8aSDimitry Andric auto res = secToCluster.try_emplace(isec, clusters.size()); 1200b57cec5SDimitry Andric if (res.second) { 1210b57cec5SDimitry Andric sections.push_back(isec); 1220b57cec5SDimitry Andric clusters.emplace_back(clusters.size(), isec->getSize()); 1230b57cec5SDimitry Andric } 1240b57cec5SDimitry Andric return res.first->second; 1250b57cec5SDimitry Andric }; 1260b57cec5SDimitry Andric 1270b57cec5SDimitry Andric // Create the graph. 1280b57cec5SDimitry Andric for (std::pair<SectionPair, uint64_t> &c : profile) { 1290eae32dcSDimitry Andric const auto *fromSB = cast<InputSectionBase>(c.first.first); 1300eae32dcSDimitry Andric const auto *toSB = cast<InputSectionBase>(c.first.second); 1310b57cec5SDimitry Andric uint64_t weight = c.second; 1320b57cec5SDimitry Andric 1330b57cec5SDimitry Andric // Ignore edges between input sections belonging to different output 1340b57cec5SDimitry Andric // sections. This is done because otherwise we would end up with clusters 1350b57cec5SDimitry Andric // containing input sections that can't actually be placed adjacently in the 1360b57cec5SDimitry Andric // output. This messes with the cluster size and density calculations. We 1370b57cec5SDimitry Andric // would also end up moving input sections in other output sections without 1380b57cec5SDimitry Andric // moving them closer to what calls them. 1390b57cec5SDimitry Andric if (fromSB->getOutputSection() != toSB->getOutputSection()) 1400b57cec5SDimitry Andric continue; 1410b57cec5SDimitry Andric 1420b57cec5SDimitry Andric int from = getOrCreateNode(fromSB); 1430b57cec5SDimitry Andric int to = getOrCreateNode(toSB); 1440b57cec5SDimitry Andric 1450b57cec5SDimitry Andric clusters[to].weight += weight; 1460b57cec5SDimitry Andric 1470b57cec5SDimitry Andric if (from == to) 1480b57cec5SDimitry Andric continue; 1490b57cec5SDimitry Andric 1500b57cec5SDimitry Andric // Remember the best edge. 1510b57cec5SDimitry Andric Cluster &toC = clusters[to]; 1520b57cec5SDimitry Andric if (toC.bestPred.from == -1 || toC.bestPred.weight < weight) { 1530b57cec5SDimitry Andric toC.bestPred.from = from; 1540b57cec5SDimitry Andric toC.bestPred.weight = weight; 1550b57cec5SDimitry Andric } 1560b57cec5SDimitry Andric } 1570b57cec5SDimitry Andric for (Cluster &c : clusters) 1580b57cec5SDimitry Andric c.initialWeight = c.weight; 1590b57cec5SDimitry Andric } 1600b57cec5SDimitry Andric 1610b57cec5SDimitry Andric // It's bad to merge clusters which would degrade the density too much. 1620b57cec5SDimitry Andric static bool isNewDensityBad(Cluster &a, Cluster &b) { 1630b57cec5SDimitry Andric double newDensity = double(a.weight + b.weight) / double(a.size + b.size); 1640b57cec5SDimitry Andric return newDensity < a.getDensity() / MAX_DENSITY_DEGRADATION; 1650b57cec5SDimitry Andric } 1660b57cec5SDimitry Andric 16785868e8aSDimitry Andric // Find the leader of V's belonged cluster (represented as an equivalence 16885868e8aSDimitry Andric // class). We apply union-find path-halving technique (simple to implement) in 16985868e8aSDimitry Andric // the meantime as it decreases depths and the time complexity. 170bdd1243dSDimitry Andric static int getLeader(int *leaders, int v) { 17185868e8aSDimitry Andric while (leaders[v] != v) { 17285868e8aSDimitry Andric leaders[v] = leaders[leaders[v]]; 17385868e8aSDimitry Andric v = leaders[v]; 17485868e8aSDimitry Andric } 17585868e8aSDimitry Andric return v; 17685868e8aSDimitry Andric } 17785868e8aSDimitry Andric 17885868e8aSDimitry Andric static void mergeClusters(std::vector<Cluster> &cs, Cluster &into, int intoIdx, 17985868e8aSDimitry Andric Cluster &from, int fromIdx) { 18085868e8aSDimitry Andric int tail1 = into.prev, tail2 = from.prev; 18185868e8aSDimitry Andric into.prev = tail2; 18285868e8aSDimitry Andric cs[tail2].next = intoIdx; 18385868e8aSDimitry Andric from.prev = tail1; 18485868e8aSDimitry Andric cs[tail1].next = fromIdx; 1850b57cec5SDimitry Andric into.size += from.size; 1860b57cec5SDimitry Andric into.weight += from.weight; 1870b57cec5SDimitry Andric from.size = 0; 1880b57cec5SDimitry Andric from.weight = 0; 1890b57cec5SDimitry Andric } 1900b57cec5SDimitry Andric 1910b57cec5SDimitry Andric // Group InputSections into clusters using the Call-Chain Clustering heuristic 1920b57cec5SDimitry Andric // then sort the clusters by density. 19385868e8aSDimitry Andric DenseMap<const InputSectionBase *, int> CallGraphSort::run() { 19485868e8aSDimitry Andric std::vector<int> sorted(clusters.size()); 195bdd1243dSDimitry Andric std::unique_ptr<int[]> leaders(new int[clusters.size()]); 1960b57cec5SDimitry Andric 197bdd1243dSDimitry Andric std::iota(leaders.get(), leaders.get() + clusters.size(), 0); 19885868e8aSDimitry Andric std::iota(sorted.begin(), sorted.end(), 0); 19985868e8aSDimitry Andric llvm::stable_sort(sorted, [&](int a, int b) { 2000b57cec5SDimitry Andric return clusters[a].getDensity() > clusters[b].getDensity(); 2010b57cec5SDimitry Andric }); 2020b57cec5SDimitry Andric 20385868e8aSDimitry Andric for (int l : sorted) { 20485868e8aSDimitry Andric // The cluster index is the same as the index of its leader here because 20585868e8aSDimitry Andric // clusters[L] has not been merged into another cluster yet. 20685868e8aSDimitry Andric Cluster &c = clusters[l]; 2070b57cec5SDimitry Andric 2080b57cec5SDimitry Andric // Don't consider merging if the edge is unlikely. 2090b57cec5SDimitry Andric if (c.bestPred.from == -1 || c.bestPred.weight * 10 <= c.initialWeight) 2100b57cec5SDimitry Andric continue; 2110b57cec5SDimitry Andric 212bdd1243dSDimitry Andric int predL = getLeader(leaders.get(), c.bestPred.from); 21385868e8aSDimitry Andric if (l == predL) 2140b57cec5SDimitry Andric continue; 2150b57cec5SDimitry Andric 21685868e8aSDimitry Andric Cluster *predC = &clusters[predL]; 2170b57cec5SDimitry Andric if (c.size + predC->size > MAX_CLUSTER_SIZE) 2180b57cec5SDimitry Andric continue; 2190b57cec5SDimitry Andric 2200b57cec5SDimitry Andric if (isNewDensityBad(*predC, c)) 2210b57cec5SDimitry Andric continue; 2220b57cec5SDimitry Andric 22385868e8aSDimitry Andric leaders[l] = predL; 22485868e8aSDimitry Andric mergeClusters(clusters, *predC, predL, c, l); 2250b57cec5SDimitry Andric } 2260b57cec5SDimitry Andric 22785868e8aSDimitry Andric // Sort remaining non-empty clusters by density. 22885868e8aSDimitry Andric sorted.clear(); 22985868e8aSDimitry Andric for (int i = 0, e = (int)clusters.size(); i != e; ++i) 23085868e8aSDimitry Andric if (clusters[i].size > 0) 23185868e8aSDimitry Andric sorted.push_back(i); 23285868e8aSDimitry Andric llvm::stable_sort(sorted, [&](int a, int b) { 23385868e8aSDimitry Andric return clusters[a].getDensity() > clusters[b].getDensity(); 2340b57cec5SDimitry Andric }); 2350b57cec5SDimitry Andric 2360b57cec5SDimitry Andric DenseMap<const InputSectionBase *, int> orderMap; 23785868e8aSDimitry Andric int curOrder = 1; 238e8d8bef9SDimitry Andric for (int leader : sorted) { 23985868e8aSDimitry Andric for (int i = leader;;) { 24085868e8aSDimitry Andric orderMap[sections[i]] = curOrder++; 24185868e8aSDimitry Andric i = clusters[i].next; 24285868e8aSDimitry Andric if (i == leader) 24385868e8aSDimitry Andric break; 24485868e8aSDimitry Andric } 245e8d8bef9SDimitry Andric } 2460b57cec5SDimitry Andric if (!config->printSymbolOrder.empty()) { 2470b57cec5SDimitry Andric std::error_code ec; 24885868e8aSDimitry Andric raw_fd_ostream os(config->printSymbolOrder, ec, sys::fs::OF_None); 2490b57cec5SDimitry Andric if (ec) { 2500b57cec5SDimitry Andric error("cannot open " + config->printSymbolOrder + ": " + ec.message()); 2510b57cec5SDimitry Andric return orderMap; 2520b57cec5SDimitry Andric } 2530b57cec5SDimitry Andric 2540b57cec5SDimitry Andric // Print the symbols ordered by C3, in the order of increasing curOrder 2550b57cec5SDimitry Andric // Instead of sorting all the orderMap, just repeat the loops above. 25685868e8aSDimitry Andric for (int leader : sorted) 25785868e8aSDimitry Andric for (int i = leader;;) { 2580b57cec5SDimitry Andric // Search all the symbols in the file of the section 2590b57cec5SDimitry Andric // and find out a Defined symbol with name that is within the section. 26085868e8aSDimitry Andric for (Symbol *sym : sections[i]->file->getSymbols()) 2610b57cec5SDimitry Andric if (!sym->isSection()) // Filter out section-type symbols here. 2620b57cec5SDimitry Andric if (auto *d = dyn_cast<Defined>(sym)) 26385868e8aSDimitry Andric if (sections[i] == d->section) 2640b57cec5SDimitry Andric os << sym->getName() << "\n"; 26585868e8aSDimitry Andric i = clusters[i].next; 26685868e8aSDimitry Andric if (i == leader) 26785868e8aSDimitry Andric break; 26885868e8aSDimitry Andric } 2690b57cec5SDimitry Andric } 2700b57cec5SDimitry Andric 2710b57cec5SDimitry Andric return orderMap; 2720b57cec5SDimitry Andric } 2730b57cec5SDimitry Andric 274*5f757f3fSDimitry Andric // Sort sections by the profile data using the Cache-Directed Sort algorithm. 275*5f757f3fSDimitry Andric // The placement is done by optimizing the locality by co-locating frequently 276*5f757f3fSDimitry Andric // executed code sections together. 277*5f757f3fSDimitry Andric DenseMap<const InputSectionBase *, int> elf::computeCacheDirectedSortOrder() { 278*5f757f3fSDimitry Andric SmallVector<uint64_t, 0> funcSizes; 279*5f757f3fSDimitry Andric SmallVector<uint64_t, 0> funcCounts; 280*5f757f3fSDimitry Andric SmallVector<codelayout::EdgeCount, 0> callCounts; 281*5f757f3fSDimitry Andric SmallVector<uint64_t, 0> callOffsets; 282*5f757f3fSDimitry Andric SmallVector<const InputSectionBase *, 0> sections; 283*5f757f3fSDimitry Andric DenseMap<const InputSectionBase *, size_t> secToTargetId; 284*5f757f3fSDimitry Andric 285*5f757f3fSDimitry Andric auto getOrCreateNode = [&](const InputSectionBase *inSec) -> size_t { 286*5f757f3fSDimitry Andric auto res = secToTargetId.try_emplace(inSec, sections.size()); 287*5f757f3fSDimitry Andric if (res.second) { 288*5f757f3fSDimitry Andric // inSec does not appear before in the graph. 289*5f757f3fSDimitry Andric sections.push_back(inSec); 290*5f757f3fSDimitry Andric funcSizes.push_back(inSec->getSize()); 291*5f757f3fSDimitry Andric funcCounts.push_back(0); 292*5f757f3fSDimitry Andric } 293*5f757f3fSDimitry Andric return res.first->second; 294*5f757f3fSDimitry Andric }; 295*5f757f3fSDimitry Andric 296*5f757f3fSDimitry Andric // Create the graph. 297*5f757f3fSDimitry Andric for (std::pair<SectionPair, uint64_t> &c : config->callGraphProfile) { 298*5f757f3fSDimitry Andric const InputSectionBase *fromSB = cast<InputSectionBase>(c.first.first); 299*5f757f3fSDimitry Andric const InputSectionBase *toSB = cast<InputSectionBase>(c.first.second); 300*5f757f3fSDimitry Andric // Ignore edges between input sections belonging to different sections. 301*5f757f3fSDimitry Andric if (fromSB->getOutputSection() != toSB->getOutputSection()) 302*5f757f3fSDimitry Andric continue; 303*5f757f3fSDimitry Andric 304*5f757f3fSDimitry Andric uint64_t weight = c.second; 305*5f757f3fSDimitry Andric // Ignore edges with zero weight. 306*5f757f3fSDimitry Andric if (weight == 0) 307*5f757f3fSDimitry Andric continue; 308*5f757f3fSDimitry Andric 309*5f757f3fSDimitry Andric size_t from = getOrCreateNode(fromSB); 310*5f757f3fSDimitry Andric size_t to = getOrCreateNode(toSB); 311*5f757f3fSDimitry Andric // Ignore self-edges (recursive calls). 312*5f757f3fSDimitry Andric if (from == to) 313*5f757f3fSDimitry Andric continue; 314*5f757f3fSDimitry Andric 315*5f757f3fSDimitry Andric callCounts.push_back({from, to, weight}); 316*5f757f3fSDimitry Andric // Assume that the jump is at the middle of the input section. The profile 317*5f757f3fSDimitry Andric // data does not contain jump offsets. 318*5f757f3fSDimitry Andric callOffsets.push_back((funcSizes[from] + 1) / 2); 319*5f757f3fSDimitry Andric funcCounts[to] += weight; 320*5f757f3fSDimitry Andric } 321*5f757f3fSDimitry Andric 322*5f757f3fSDimitry Andric // Run the layout algorithm. 323*5f757f3fSDimitry Andric std::vector<uint64_t> sortedSections = codelayout::computeCacheDirectedLayout( 324*5f757f3fSDimitry Andric funcSizes, funcCounts, callCounts, callOffsets); 325*5f757f3fSDimitry Andric 326*5f757f3fSDimitry Andric // Create the final order. 327*5f757f3fSDimitry Andric DenseMap<const InputSectionBase *, int> orderMap; 328*5f757f3fSDimitry Andric int curOrder = 1; 329*5f757f3fSDimitry Andric for (uint64_t secIdx : sortedSections) 330*5f757f3fSDimitry Andric orderMap[sections[secIdx]] = curOrder++; 331*5f757f3fSDimitry Andric 332*5f757f3fSDimitry Andric return orderMap; 333*5f757f3fSDimitry Andric } 334*5f757f3fSDimitry Andric 335349cc55cSDimitry Andric // Sort sections by the profile data provided by --callgraph-profile-file. 3360b57cec5SDimitry Andric // 3370b57cec5SDimitry Andric // This first builds a call graph based on the profile data then merges sections 338*5f757f3fSDimitry Andric // according either to the C³ or Cache-Directed-Sort ordering algorithm. 3395ffd83dbSDimitry Andric DenseMap<const InputSectionBase *, int> elf::computeCallGraphProfileOrder() { 340*5f757f3fSDimitry Andric if (config->callGraphProfileSort == CGProfileSortKind::Cdsort) 341*5f757f3fSDimitry Andric return computeCacheDirectedSortOrder(); 3420b57cec5SDimitry Andric return CallGraphSort().run(); 3430b57cec5SDimitry Andric } 344