1 //===- CallGraphSort.cpp --------------------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// 9 /// The file is responsible for sorting sections using LLVM call graph profile 10 /// data by placing frequently executed code sections together. The goal of the 11 /// placement is to improve the runtime performance of the final executable by 12 /// arranging code sections so that i-TLB misses and i-cache misses are reduced. 13 /// 14 /// The algorithm first builds a call graph based on the profile data and then 15 /// iteratively merges "chains" (ordered lists) of input sections which will be 16 /// laid out as a unit. There are two implementations for deciding how to 17 /// merge a pair of chains: 18 /// - a simpler one, referred to as Call-Chain Clustering (C^3), that follows 19 /// "Optimizing Function Placement for Large-Scale Data-Center Applications" 20 /// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf 21 /// - a more advanced one, referred to as Cache-Directed-Sort (CDSort), which 22 /// typically produces layouts with higher locality, and hence, yields fewer 23 /// instruction cache misses on large binaries. 
//===----------------------------------------------------------------------===//

#include "CallGraphSort.h"
#include "InputFiles.h"
#include "InputSection.h"
#include "Symbols.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Transforms/Utils/CodeLayout.h"

#include <numeric>

using namespace llvm;
using namespace lld;
using namespace lld::elf;

namespace {
// A directed call-graph edge as stored on its *destination* cluster: the index
// of the source cluster and the profile weight of the call.
struct Edge {
  int from;
  uint64_t weight;
};

// One node of the C^3 algorithm. Initially each input section is its own
// singleton cluster; merging splices clusters together via next/prev.
struct Cluster {
  // A fresh cluster is a one-element circular list: next and prev both point
  // at the cluster's own index `sec`.
  Cluster(int sec, size_t s) : next(sec), prev(sec), size(s) {}

  // Weight per byte -- a proxy for execution time spent per byte of the
  // cluster. An empty cluster has density 0 (avoids division by zero).
  double getDensity() const {
    if (size == 0)
      return 0;
    return double(weight) / double(size);
  }

  // Indices into the owning std::vector<Cluster>, forming a circular
  // doubly-linked list of the sections laid out in this cluster.
  int next;
  int prev;
  uint64_t size;              // Total byte size of all member sections.
  uint64_t weight = 0;        // Sum of incoming edge weights.
  uint64_t initialWeight = 0; // `weight` snapshot taken before any merging.
  // Heaviest incoming edge seen so far; from == -1 means "none yet".
  Edge bestPred = {-1, 0};
};

/// Implementation of the Call-Chain Clustering (C^3). The goal of this
/// algorithm is to improve runtime performance of the executable by arranging
/// code sections such that page table and i-cache misses are minimized.
///
/// Definitions:
/// * Cluster
///   * An ordered list of input sections which are laid out as a unit. At the
///     beginning of the algorithm each input section has its own cluster and
///     the weight of the cluster is the sum of the weight of all incoming
///     edges.
/// * Call-Chain Clustering (C³) Heuristic
///   * Defines when and how clusters are combined. Pick the highest weighted
///     input section then add it to its most likely predecessor if it wouldn't
///     penalize it too much.
/// * Density
///   * The weight of the cluster divided by the size of the cluster. This is a
///     proxy for the amount of execution time spent per byte of the cluster.
///
/// It does so given a call graph profile by the following:
/// * Build a weighted call graph from the call graph profile
/// * Sort input sections by weight
/// * For each input section starting with the highest weight
///   * Find its most likely predecessor cluster
///   * Check if the combined cluster would be too large, or would have too low
///     a density.
///   * If not, then combine the clusters.
/// * Sort non-empty clusters by density
class CallGraphSort {
public:
  CallGraphSort();

  // Returns a map from input section to its 1-based position in the computed
  // layout (smaller = earlier).
  DenseMap<const InputSectionBase *, int> run();

private:
  // Parallel arrays: clusters[i] describes the cluster whose original single
  // member is sections[i].
  std::vector<Cluster> clusters;
  std::vector<const InputSectionBase *> sections;
};

// Maximum amount the combined cluster density can be worse than the original
// cluster to consider merging.
constexpr int MAX_DENSITY_DEGRADATION = 8;

// Maximum cluster size in bytes.
constexpr uint64_t MAX_CLUSTER_SIZE = 1024 * 1024;
} // end anonymous namespace

using SectionPair =
    std::pair<const InputSectionBase *, const InputSectionBase *>;

// Take the edge list in Config->CallGraphProfile, resolve symbol names to
// Symbols, and generate a graph between InputSections with the provided
// weights.
CallGraphSort::CallGraphSort() {
  MapVector<SectionPair, uint64_t> &profile = config->callGraphProfile;
  DenseMap<const InputSectionBase *, int> secToCluster;

  // Map a section to its cluster index, creating a fresh singleton cluster
  // (and recording the section) on first sight.
  auto getOrCreateNode = [&](const InputSectionBase *isec) -> int {
    auto res = secToCluster.try_emplace(isec, clusters.size());
    if (res.second) {
      sections.push_back(isec);
      clusters.emplace_back(clusters.size(), isec->getSize());
    }
    return res.first->second;
  };

  // Create the graph.
  for (std::pair<SectionPair, uint64_t> &c : profile) {
    const auto *fromSB = cast<InputSectionBase>(c.first.first);
    const auto *toSB = cast<InputSectionBase>(c.first.second);
    uint64_t weight = c.second;

    // Ignore edges between input sections belonging to different output
    // sections. This is done because otherwise we would end up with clusters
    // containing input sections that can't actually be placed adjacently in the
    // output. This messes with the cluster size and density calculations. We
    // would also end up moving input sections in other output sections without
    // moving them closer to what calls them.
    if (fromSB->getOutputSection() != toSB->getOutputSection())
      continue;

    int from = getOrCreateNode(fromSB);
    int to = getOrCreateNode(toSB);

    // Every incoming edge, including a self-edge, contributes to the
    // destination's weight.
    clusters[to].weight += weight;

    // A self-edge (recursion) cannot name a predecessor cluster.
    if (from == to)
      continue;

    // Remember the best edge. Strict '<' keeps the earliest edge on ties, so
    // the result is deterministic for equal weights.
    Cluster &toC = clusters[to];
    if (toC.bestPred.from == -1 || toC.bestPred.weight < weight) {
      toC.bestPred.from = from;
      toC.bestPred.weight = weight;
    }
  }
  // Snapshot weights before merging mutates them; run() compares bestPred
  // against this initial value to judge whether the edge is "likely".
  for (Cluster &c : clusters)
    c.initialWeight = c.weight;
}

// It's bad to merge clusters which would degrade the density too much.
static bool isNewDensityBad(Cluster &a, Cluster &b) {
  double newDensity = double(a.weight + b.weight) / double(a.size + b.size);
  return newDensity < a.getDensity() / MAX_DENSITY_DEGRADATION;
}

// Find the leader of V's belonged cluster (represented as an equivalence
// class). We apply union-find path-halving technique (simple to implement) in
// the meantime as it decreases depths and the time complexity.
static int getLeader(int *leaders, int v) {
  // Path halving: point every other node on the walk at its grandparent.
  while (leaders[v] != v) {
    leaders[v] = leaders[leaders[v]];
    v = leaders[v];
  }
  return v;
}

// Splice the circular section list of `from` onto the tail of `into`'s list
// and transfer its size/weight. `from` is left as an empty (zero-size)
// cluster so it is skipped by the final density sort.
static void mergeClusters(std::vector<Cluster> &cs, Cluster &into, int intoIdx,
                          Cluster &from, int fromIdx) {
  // Stitch the two circular doubly-linked lists together: into's tail now
  // leads into `from`'s head, and `from`'s tail wraps back to `into`.
  int tail1 = into.prev, tail2 = from.prev;
  into.prev = tail2;
  cs[tail2].next = intoIdx;
  from.prev = tail1;
  cs[tail1].next = fromIdx;
  into.size += from.size;
  into.weight += from.weight;
  from.size = 0;
  from.weight = 0;
}

// Group InputSections into clusters using the Call-Chain Clustering heuristic
// then sort the clusters by density.
DenseMap<const InputSectionBase *, int> CallGraphSort::run() {
  // `sorted` is the visit order; `leaders` is the union-find parent array
  // tracking which cluster each original index has been merged into.
  std::vector<int> sorted(clusters.size());
  std::unique_ptr<int[]> leaders(new int[clusters.size()]);

  std::iota(leaders.get(), leaders.get() + clusters.size(), 0);
  std::iota(sorted.begin(), sorted.end(), 0);
  // Visit clusters hottest-first; stable sort keeps ties deterministic.
  llvm::stable_sort(sorted, [&](int a, int b) {
    return clusters[a].getDensity() > clusters[b].getDensity();
  });

  for (int l : sorted) {
    // The cluster index is the same as the index of its leader here because
    // clusters[L] has not been merged into another cluster yet.
    Cluster &c = clusters[l];

    // Don't consider merging if the edge is unlikely, i.e. if the best
    // incoming edge accounts for less than 10% of this section's original
    // incoming weight.
    if (c.bestPred.from == -1 || c.bestPred.weight * 10 <= c.initialWeight)
      continue;

    // Resolve the predecessor section to whatever cluster it lives in now.
    int predL = getLeader(leaders.get(), c.bestPred.from);
    if (l == predL)
      continue;

    Cluster *predC = &clusters[predL];
    if (c.size + predC->size > MAX_CLUSTER_SIZE)
      continue;

    if (isNewDensityBad(*predC, c))
      continue;

    leaders[l] = predL;
    mergeClusters(clusters, *predC, predL, c, l);
  }

  // Sort remaining non-empty clusters by density.
  sorted.clear();
  for (int i = 0, e = (int)clusters.size(); i != e; ++i)
    if (clusters[i].size > 0)
      sorted.push_back(i);
  llvm::stable_sort(sorted, [&](int a, int b) {
    return clusters[a].getDensity() > clusters[b].getDensity();
  });

  // Assign 1-based layout positions by walking each cluster's circular list
  // starting at its leader.
  DenseMap<const InputSectionBase *, int> orderMap;
  int curOrder = 1;
  for (int leader : sorted) {
    for (int i = leader;;) {
      orderMap[sections[i]] = curOrder++;
      i = clusters[i].next;
      if (i == leader)
        break;
    }
  }
  // Optionally dump the computed symbol order for --print-symbol-order.
  if (!config->printSymbolOrder.empty()) {
    std::error_code ec;
    raw_fd_ostream os(config->printSymbolOrder, ec, sys::fs::OF_None);
    if (ec) {
      error("cannot open " + config->printSymbolOrder + ": " + ec.message());
      return orderMap;
    }

    // Print the symbols ordered by C3, in the order of increasing curOrder
    // Instead of sorting all the orderMap, just repeat the loops above.
    for (int leader : sorted)
      for (int i = leader;;) {
        // Search all the symbols in the file of the section
        // and find out a Defined symbol with name that is within the section.
        for (Symbol *sym : sections[i]->file->getSymbols())
          if (!sym->isSection()) // Filter out section-type symbols here.
            if (auto *d = dyn_cast<Defined>(sym))
              if (sections[i] == d->section)
                os << sym->getName() << "\n";
        i = clusters[i].next;
        if (i == leader)
          break;
      }
  }

  return orderMap;
}

// Sort sections by the profile data using the Cache-Directed Sort algorithm.
// The placement is done by optimizing the locality by co-locating frequently
// executed code sections together.
DenseMap<const InputSectionBase *, int> elf::computeCacheDirectedSortOrder() {
  // Parallel per-node arrays (indexed by the ids handed out below) plus the
  // edge list, in the shape codelayout::computeCacheDirectedLayout expects.
  SmallVector<uint64_t, 0> funcSizes;
  SmallVector<uint64_t, 0> funcCounts;
  SmallVector<codelayout::EdgeCount, 0> callCounts;
  SmallVector<uint64_t, 0> callOffsets;
  SmallVector<const InputSectionBase *, 0> sections;
  DenseMap<const InputSectionBase *, size_t> secToTargetId;

  // Assign a dense node id to each section on first sight, keeping the
  // per-node arrays in lock-step with `sections`.
  auto getOrCreateNode = [&](const InputSectionBase *inSec) -> size_t {
    auto res = secToTargetId.try_emplace(inSec, sections.size());
    if (res.second) {
      // inSec does not appear before in the graph.
      sections.push_back(inSec);
      funcSizes.push_back(inSec->getSize());
      funcCounts.push_back(0);
    }
    return res.first->second;
  };

  // Create the graph.
  for (std::pair<SectionPair, uint64_t> &c : config->callGraphProfile) {
    const InputSectionBase *fromSB = cast<InputSectionBase>(c.first.first);
    const InputSectionBase *toSB = cast<InputSectionBase>(c.first.second);
    // Ignore edges between input sections belonging to different sections.
    if (fromSB->getOutputSection() != toSB->getOutputSection())
      continue;

    uint64_t weight = c.second;
    // Ignore edges with zero weight.
    if (weight == 0)
      continue;

    size_t from = getOrCreateNode(fromSB);
    size_t to = getOrCreateNode(toSB);
    // Ignore self-edges (recursive calls).
    if (from == to)
      continue;

    callCounts.push_back({from, to, weight});
    // Assume that the jump is at the middle of the input section. The profile
    // data does not contain jump offsets.
    callOffsets.push_back((funcSizes[from] + 1) / 2);
    funcCounts[to] += weight;
  }

  // Run the layout algorithm.
  std::vector<uint64_t> sortedSections = codelayout::computeCacheDirectedLayout(
      funcSizes, funcCounts, callCounts, callOffsets);

  // Create the final order: map each section to its 1-based position in the
  // layout returned by the algorithm.
  DenseMap<const InputSectionBase *, int> orderMap;
  int curOrder = 1;
  for (uint64_t secIdx : sortedSections)
    orderMap[sections[secIdx]] = curOrder++;

  return orderMap;
}

// Sort sections by the profile data provided by --callgraph-profile-file.
//
// This first builds a call graph based on the profile data then merges sections
// according either to the C³ or Cache-Directed-Sort ordering algorithm.
DenseMap<const InputSectionBase *, int> elf::computeCallGraphProfileOrder() {
  // Dispatch on the user-selected algorithm; CDSort is the newer alternative,
  // C^3 the default fallback.
  if (config->callGraphProfileSort == CGProfileSortKind::Cdsort)
    return computeCacheDirectedSortOrder();
  return CallGraphSort().run();
}