1 //===- ConcatOutputSection.cpp --------------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #include "ConcatOutputSection.h" 10 #include "Config.h" 11 #include "OutputSegment.h" 12 #include "SymbolTable.h" 13 #include "Symbols.h" 14 #include "SyntheticSections.h" 15 #include "Target.h" 16 #include "lld/Common/CommonLinkerContext.h" 17 #include "llvm/BinaryFormat/MachO.h" 18 19 using namespace llvm; 20 using namespace llvm::MachO; 21 using namespace lld; 22 using namespace lld::macho; 23 24 MapVector<NamePair, ConcatOutputSection *> macho::concatOutputSections; 25 26 void ConcatOutputSection::addInput(ConcatInputSection *input) { 27 assert(input->parent == this); 28 if (inputs.empty()) { 29 align = input->align; 30 flags = input->getFlags(); 31 } else { 32 align = std::max(align, input->align); 33 finalizeFlags(input); 34 } 35 inputs.push_back(input); 36 } 37 38 // Branch-range extension can be implemented in two ways, either through ... 39 // 40 // (1) Branch islands: Single branch instructions (also of limited range), 41 // that might be chained in multiple hops to reach the desired 42 // destination. On ARM64, as 16 branch islands are needed to hop between 43 // opposite ends of a 2 GiB program. LD64 uses branch islands exclusively, 44 // even when it needs excessive hops. 45 // 46 // (2) Thunks: Instruction(s) to load the destination address into a scratch 47 // register, followed by a register-indirect branch. Thunks are 48 // constructed to reach any arbitrary address, so need not be 49 // chained. Although thunks need not be chained, a program might need 50 // multiple thunks to the same destination distributed throughout a large 51 // program so that all call sites can have one within range. 52 // 53 // The optimal approach is to mix islands for destinations within two hops, 54 // and use thunks for destinations at greater distance. For now, we only 55 // implement thunks. TODO: Adding support for branch islands! 56 // 57 // Internally -- as expressed in LLD's data structures -- a 58 // branch-range-extension thunk consists of: 59 // 60 // (1) new Defined symbol for the thunk named 61 // <FUNCTION>.thunk.<SEQUENCE>, which references ... 62 // (2) new InputSection, which contains ... 63 // (3.1) new data for the instructions to load & branch to the far address + 64 // (3.2) new Relocs on instructions to load the far address, which reference ... 65 // (4.1) existing Defined symbol for the real function in __text, or 66 // (4.2) existing DylibSymbol for the real function in a dylib 67 // 68 // Nearly-optimal thunk-placement algorithm features: 69 // 70 // * Single pass: O(n) on the number of call sites. 71 // 72 // * Accounts for the exact space overhead of thunks - no heuristics 73 // 74 // * Exploits the full range of call instructions - forward & backward 75 // 76 // Data: 77 // 78 // * DenseMap<Symbol *, ThunkInfo> thunkMap: Maps the function symbol 79 // to its thunk bookkeeper. 80 // 81 // * struct ThunkInfo (bookkeeper): Call instructions have limited range, and 82 // distant call sites might be unable to reach the same thunk, so multiple 83 // thunks are necessary to serve all call sites in a very large program. A 84 // thunkInfo stores state for all thunks associated with a particular 85 // function: 86 // (a) thunk symbol 87 // (b) input section containing stub code, and 88 // (c) sequence number for the active thunk incarnation. 89 // When an old thunk goes out of range, we increment the sequence number and 90 // create a new thunk named <FUNCTION>.thunk.<SEQUENCE>. 91 // 92 // * A thunk consists of 93 // (a) a Defined symbol pointing to 94 // (b) an InputSection holding machine code (similar to a MachO stub), and 95 // (c) relocs referencing the real function for fixing up the stub code. 96 // 97 // * std::vector<InputSection *> MergedInputSection::thunks: A vector parallel 98 // to the inputs vector. We store new thunks via cheap vector append, rather 99 // than costly insertion into the inputs vector. 100 // 101 // Control Flow: 102 // 103 // * During address assignment, MergedInputSection::finalize() examines call 104 // sites by ascending address and creates thunks. When a function is beyond 105 // the range of a call site, we need a thunk. Place it at the largest 106 // available forward address from the call site. Call sites increase 107 // monotonically and thunks are always placed as far forward as possible; 108 // thus, we place thunks at monotonically increasing addresses. Once a thunk 109 // is placed, it and all previous input-section addresses are final. 110 // 111 // * ConcatInputSection::finalize() and ConcatInputSection::writeTo() merge 112 // the inputs and thunks vectors (both ordered by ascending address), which 113 // is simple and cheap. 114 115 DenseMap<Symbol *, ThunkInfo> lld::macho::thunkMap; 116 117 // Determine whether we need thunks, which depends on the target arch -- RISC 118 // (i.e., ARM) generally does because it has limited-range branch/call 119 // instructions, whereas CISC (i.e., x86) generally doesn't. RISC only needs 120 // thunks for programs so large that branch source & destination addresses 121 // might differ more than the range of branch instruction(s). 122 bool TextOutputSection::needsThunks() const { 123 if (!target->usesThunks()) 124 return false; 125 uint64_t isecAddr = addr; 126 for (ConcatInputSection *isec : inputs) 127 isecAddr = alignToPowerOf2(isecAddr, isec->align) + isec->getSize(); 128 // Other sections besides __text might be small enough to pass this 129 // test but nevertheless need thunks for calling into other sections. 130 // An imperfect heuristic to use in this case is that if a section 131 // we've already processed in this segment needs thunks, so do the 132 // rest. 133 bool needsThunks = parent && parent->needsThunks; 134 135 // Calculate the total size of all branch target sections 136 uint64_t branchTargetsSize = in.stubs->getSize(); 137 138 // Add the size of __objc_stubs section if it exists 139 if (in.objcStubs && in.objcStubs->isNeeded()) 140 branchTargetsSize += in.objcStubs->getSize(); 141 142 if (!needsThunks && 143 isecAddr - addr + branchTargetsSize <= 144 std::min(target->backwardBranchRange, target->forwardBranchRange)) 145 return false; 146 // Yes, this program is large enough to need thunks. 147 if (parent) { 148 parent->needsThunks = true; 149 } 150 for (ConcatInputSection *isec : inputs) { 151 for (Reloc &r : isec->relocs) { 152 if (!target->hasAttr(r.type, RelocAttrBits::BRANCH)) 153 continue; 154 auto *sym = cast<Symbol *>(r.referent); 155 // Pre-populate the thunkMap and memoize call site counts for every 156 // InputSection and ThunkInfo. We do this for the benefit of 157 // estimateBranchTargetThresholdVA(). 158 ThunkInfo &thunkInfo = thunkMap[sym]; 159 // Knowing ThunkInfo call site count will help us know whether or not we 160 // might need to create more for this referent at the time we are 161 // estimating distance to __stubs in estimateBranchTargetThresholdVA(). 162 ++thunkInfo.callSiteCount; 163 // We can avoid work on InputSections that have no BRANCH relocs. 164 isec->hasCallSites = true; 165 } 166 } 167 return true; 168 } 169 170 // Estimate the address beyond which branch targets (like __stubs and 171 // __objc_stubs) are within range of a simple forward branch. This is called 172 // exactly once, when the last input section has been finalized. 173 uint64_t 174 TextOutputSection::estimateBranchTargetThresholdVA(size_t callIdx) const { 175 // Tally the functions which still have call sites remaining to process, 176 // which yields the maximum number of thunks we might yet place. 177 size_t maxPotentialThunks = 0; 178 for (auto &tp : thunkMap) { 179 ThunkInfo &ti = tp.second; 180 // This overcounts: Only sections that are in forward jump range from the 181 // currently-active section get finalized, and all input sections are 182 // finalized when estimateBranchTargetThresholdVA() is called. So only 183 // backward jumps will need thunks, but we count all jumps. 184 if (ti.callSitesUsed < ti.callSiteCount) 185 maxPotentialThunks += 1; 186 } 187 // Tally the total size of input sections remaining to process. 188 uint64_t isecVA = inputs[callIdx]->getVA(); 189 uint64_t isecEnd = isecVA; 190 for (size_t i = callIdx; i < inputs.size(); i++) { 191 InputSection *isec = inputs[i]; 192 isecEnd = alignToPowerOf2(isecEnd, isec->align) + isec->getSize(); 193 } 194 195 // Tally up any thunks that have already been placed that have VA higher than 196 // inputs[callIdx]. First, find the index of the first thunk that is beyond 197 // the current inputs[callIdx]. 198 auto itPostcallIdxThunks = 199 llvm::partition_point(thunks, [isecVA](const ConcatInputSection *t) { 200 return t->getVA() <= isecVA; 201 }); 202 uint64_t existingForwardThunks = thunks.end() - itPostcallIdxThunks; 203 204 uint64_t forwardBranchRange = target->forwardBranchRange; 205 assert(isecEnd > forwardBranchRange && 206 "should not run thunk insertion if all code fits in jump range"); 207 assert(isecEnd - isecVA <= forwardBranchRange && 208 "should only finalize sections in jump range"); 209 210 // Estimate the maximum size of the code, right before the branch target 211 // sections. 212 uint64_t maxTextSize = 0; 213 // Add the size of all the inputs, including the unprocessed ones. 214 maxTextSize += isecEnd; 215 216 // Add the size of the thunks that have already been created that are ahead of 217 // inputs[callIdx]. These are already created thunks that will be interleaved 218 // with inputs[callIdx...end]. 219 maxTextSize += existingForwardThunks * target->thunkSize; 220 221 // Add the size of the thunks that may be created in the future. Since 222 // 'maxPotentialThunks' overcounts, this is an estimate of the upper limit. 223 maxTextSize += maxPotentialThunks * target->thunkSize; 224 225 // Calculate the total size of all late branch target sections 226 uint64_t branchTargetsSize = 0; 227 228 // Add the size of __stubs section 229 branchTargetsSize += in.stubs->getSize(); 230 231 // Add the size of __objc_stubs section if it exists 232 if (in.objcStubs && in.objcStubs->isNeeded()) 233 branchTargetsSize += in.objcStubs->getSize(); 234 235 // Estimated maximum VA of the last branch target. 236 uint64_t maxVAOfLastBranchTarget = maxTextSize + branchTargetsSize; 237 238 // Estimate the address after which call sites can safely call branch targets 239 // directly rather than through intermediary thunks. 240 uint64_t branchTargetThresholdVA = 241 maxVAOfLastBranchTarget - forwardBranchRange; 242 243 log("thunks = " + std::to_string(thunkMap.size()) + 244 ", potential = " + std::to_string(maxPotentialThunks) + 245 ", stubs = " + std::to_string(in.stubs->getSize()) + 246 (in.objcStubs && in.objcStubs->isNeeded() 247 ? ", objc_stubs = " + std::to_string(in.objcStubs->getSize()) 248 : "") + 249 ", isecVA = " + utohexstr(isecVA) + ", threshold = " + 250 utohexstr(branchTargetThresholdVA) + ", isecEnd = " + utohexstr(isecEnd) + 251 ", tail = " + utohexstr(isecEnd - isecVA) + 252 ", slop = " + utohexstr(forwardBranchRange - (isecEnd - isecVA))); 253 return branchTargetThresholdVA; 254 } 255 256 void ConcatOutputSection::finalizeOne(ConcatInputSection *isec) { 257 size = alignToPowerOf2(size, isec->align); 258 fileSize = alignToPowerOf2(fileSize, isec->align); 259 isec->outSecOff = size; 260 isec->isFinal = true; 261 size += isec->getSize(); 262 fileSize += isec->getFileSize(); 263 } 264 265 void ConcatOutputSection::finalizeContents() { 266 for (ConcatInputSection *isec : inputs) 267 finalizeOne(isec); 268 } 269 270 void TextOutputSection::finalize() { 271 if (!needsThunks()) { 272 for (ConcatInputSection *isec : inputs) 273 finalizeOne(isec); 274 return; 275 } 276 277 uint64_t forwardBranchRange = target->forwardBranchRange; 278 uint64_t backwardBranchRange = target->backwardBranchRange; 279 uint64_t branchTargetThresholdVA = TargetInfo::outOfRangeVA; 280 size_t thunkSize = target->thunkSize; 281 size_t relocCount = 0; 282 size_t callSiteCount = 0; 283 size_t thunkCallCount = 0; 284 size_t thunkCount = 0; 285 286 // Walk all sections in order. Finalize all sections that are less than 287 // forwardBranchRange in front of it. 288 // isecVA is the address of the current section. 289 // addr + size is the start address of the first non-finalized section. 290 291 // inputs[finalIdx] is for finalization (address-assignment) 292 size_t finalIdx = 0; 293 // Kick-off by ensuring that the first input section has an address 294 for (size_t callIdx = 0, endIdx = inputs.size(); callIdx < endIdx; 295 ++callIdx) { 296 if (finalIdx == callIdx) 297 finalizeOne(inputs[finalIdx++]); 298 ConcatInputSection *isec = inputs[callIdx]; 299 assert(isec->isFinal); 300 uint64_t isecVA = isec->getVA(); 301 302 // Assign addresses up-to the forward branch-range limit. 303 // Every call instruction needs a small number of bytes (on Arm64: 4), 304 // and each inserted thunk needs a slightly larger number of bytes 305 // (on Arm64: 12). If a section starts with a branch instruction and 306 // contains several branch instructions in succession, then the distance 307 // from the current position to the position where the thunks are inserted 308 // grows. So leave room for a bunch of thunks. 309 unsigned slop = 256 * thunkSize; 310 while (finalIdx < endIdx) { 311 uint64_t expectedNewSize = 312 alignToPowerOf2(addr + size, inputs[finalIdx]->align) + 313 inputs[finalIdx]->getSize(); 314 if (expectedNewSize >= isecVA + forwardBranchRange - slop) 315 break; 316 finalizeOne(inputs[finalIdx++]); 317 } 318 319 if (!isec->hasCallSites) 320 continue; 321 322 if (finalIdx == endIdx && 323 branchTargetThresholdVA == TargetInfo::outOfRangeVA) { 324 // When we have finalized all input sections, branch target sections (like 325 // __stubs and __objc_stubs) (destined to follow __text) come within range 326 // of forward branches and we can estimate the threshold address after 327 // which we can reach any branch target with a forward branch. Note that 328 // although it sits in the middle of a loop, this code executes only once. 329 // It is in the loop because we need to call it at the proper 330 // time: the earliest call site from which the end of __text 331 // (and start of branch target sections) comes within range of a forward 332 // branch. 333 branchTargetThresholdVA = estimateBranchTargetThresholdVA(callIdx); 334 } 335 // Process relocs by ascending address, i.e., ascending offset within isec 336 std::vector<Reloc> &relocs = isec->relocs; 337 // FIXME: This property does not hold for object files produced by ld64's 338 // `-r` mode. 339 assert(is_sorted(relocs, 340 [](Reloc &a, Reloc &b) { return a.offset > b.offset; })); 341 for (Reloc &r : reverse(relocs)) { 342 ++relocCount; 343 if (!target->hasAttr(r.type, RelocAttrBits::BRANCH)) 344 continue; 345 ++callSiteCount; 346 // Calculate branch reachability boundaries 347 uint64_t callVA = isecVA + r.offset; 348 uint64_t lowVA = 349 backwardBranchRange < callVA ? callVA - backwardBranchRange : 0; 350 uint64_t highVA = callVA + forwardBranchRange; 351 // Calculate our call referent address 352 auto *funcSym = cast<Symbol *>(r.referent); 353 ThunkInfo &thunkInfo = thunkMap[funcSym]; 354 // The referent is not reachable, so we need to use a thunk ... 355 if ((funcSym->isInStubs() || 356 (in.objcStubs && in.objcStubs->isNeeded() && 357 ObjCStubsSection::isObjCStubSymbol(funcSym))) && 358 callVA >= branchTargetThresholdVA) { 359 assert(callVA != TargetInfo::outOfRangeVA); 360 // ... Oh, wait! We are close enough to the end that branch target 361 // sections (__stubs, __objc_stubs) are now within range of a simple 362 // forward branch. 363 continue; 364 } 365 uint64_t funcVA = funcSym->resolveBranchVA(); 366 ++thunkInfo.callSitesUsed; 367 if (lowVA <= funcVA && funcVA <= highVA) { 368 // The referent is reachable with a simple call instruction. 369 continue; 370 } 371 ++thunkInfo.thunkCallCount; 372 ++thunkCallCount; 373 // If an existing thunk is reachable, use it ... 374 if (thunkInfo.sym) { 375 uint64_t thunkVA = thunkInfo.isec->getVA(); 376 if (lowVA <= thunkVA && thunkVA <= highVA) { 377 r.referent = thunkInfo.sym; 378 continue; 379 } 380 } 381 // ... otherwise, create a new thunk. 382 if (addr + size > highVA) { 383 // There were too many consecutive branch instructions for `slop` 384 // above. If you hit this: For the current algorithm, just bumping up 385 // slop above and trying again is probably simplest. (See also PR51578 386 // comment 5). 387 fatal(Twine(__FUNCTION__) + ": FIXME: thunk range overrun"); 388 } 389 thunkInfo.isec = 390 makeSyntheticInputSection(isec->getSegName(), isec->getName()); 391 thunkInfo.isec->parent = this; 392 assert(thunkInfo.isec->live); 393 394 StringRef thunkName = saver().save(funcSym->getName() + ".thunk." + 395 std::to_string(thunkInfo.sequence++)); 396 if (!isa<Defined>(funcSym) || cast<Defined>(funcSym)->isExternal()) { 397 r.referent = thunkInfo.sym = symtab->addDefined( 398 thunkName, /*file=*/nullptr, thunkInfo.isec, /*value=*/0, thunkSize, 399 /*isWeakDef=*/false, /*isPrivateExtern=*/true, 400 /*isReferencedDynamically=*/false, /*noDeadStrip=*/false, 401 /*isWeakDefCanBeHidden=*/false); 402 } else { 403 r.referent = thunkInfo.sym = make<Defined>( 404 thunkName, /*file=*/nullptr, thunkInfo.isec, /*value=*/0, thunkSize, 405 /*isWeakDef=*/false, /*isExternal=*/false, /*isPrivateExtern=*/true, 406 /*includeInSymtab=*/true, /*isReferencedDynamically=*/false, 407 /*noDeadStrip=*/false, /*isWeakDefCanBeHidden=*/false); 408 } 409 thunkInfo.sym->used = true; 410 target->populateThunk(thunkInfo.isec, funcSym); 411 finalizeOne(thunkInfo.isec); 412 thunks.push_back(thunkInfo.isec); 413 ++thunkCount; 414 } 415 } 416 417 log("thunks for " + parent->name + "," + name + 418 ": funcs = " + std::to_string(thunkMap.size()) + 419 ", relocs = " + std::to_string(relocCount) + 420 ", all calls = " + std::to_string(callSiteCount) + 421 ", thunk calls = " + std::to_string(thunkCallCount) + 422 ", thunks = " + std::to_string(thunkCount)); 423 } 424 425 void ConcatOutputSection::writeTo(uint8_t *buf) const { 426 for (ConcatInputSection *isec : inputs) 427 isec->writeTo(buf + isec->outSecOff); 428 } 429 430 void TextOutputSection::writeTo(uint8_t *buf) const { 431 // Merge input sections from thunk & ordinary vectors 432 size_t i = 0, ie = inputs.size(); 433 size_t t = 0, te = thunks.size(); 434 while (i < ie || t < te) { 435 while (i < ie && (t == te || inputs[i]->empty() || 436 inputs[i]->outSecOff < thunks[t]->outSecOff)) { 437 inputs[i]->writeTo(buf + inputs[i]->outSecOff); 438 ++i; 439 } 440 while (t < te && (i == ie || thunks[t]->outSecOff < inputs[i]->outSecOff)) { 441 thunks[t]->writeTo(buf + thunks[t]->outSecOff); 442 ++t; 443 } 444 } 445 } 446 447 void ConcatOutputSection::finalizeFlags(InputSection *input) { 448 switch (sectionType(input->getFlags())) { 449 default /*type-unspec'ed*/: 450 // FIXME: Add additional logic here when supporting emitting obj files. 451 break; 452 case S_4BYTE_LITERALS: 453 case S_8BYTE_LITERALS: 454 case S_16BYTE_LITERALS: 455 case S_CSTRING_LITERALS: 456 case S_ZEROFILL: 457 case S_LAZY_SYMBOL_POINTERS: 458 case S_MOD_TERM_FUNC_POINTERS: 459 case S_THREAD_LOCAL_REGULAR: 460 case S_THREAD_LOCAL_ZEROFILL: 461 case S_THREAD_LOCAL_VARIABLES: 462 case S_THREAD_LOCAL_INIT_FUNCTION_POINTERS: 463 case S_THREAD_LOCAL_VARIABLE_POINTERS: 464 case S_NON_LAZY_SYMBOL_POINTERS: 465 case S_SYMBOL_STUBS: 466 flags |= input->getFlags(); 467 break; 468 } 469 } 470 471 ConcatOutputSection * 472 ConcatOutputSection::getOrCreateForInput(const InputSection *isec) { 473 NamePair names = maybeRenameSection({isec->getSegName(), isec->getName()}); 474 ConcatOutputSection *&osec = concatOutputSections[names]; 475 if (!osec) { 476 if (isec->getSegName() == segment_names::text && 477 isec->getName() != section_names::gccExceptTab && 478 isec->getName() != section_names::ehFrame) 479 osec = make<TextOutputSection>(names.second); 480 else 481 osec = make<ConcatOutputSection>(names.second); 482 } 483 return osec; 484 } 485 486 NamePair macho::maybeRenameSection(NamePair key) { 487 auto newNames = config->sectionRenameMap.find(key); 488 if (newNames != config->sectionRenameMap.end()) 489 return newNames->second; 490 return key; 491 } 492