15ffd83dbSDimitry Andric //===- ExportTrie.cpp -----------------------------------------------------===// 25ffd83dbSDimitry Andric // 35ffd83dbSDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 45ffd83dbSDimitry Andric // See https://llvm.org/LICENSE.txt for license information. 55ffd83dbSDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 65ffd83dbSDimitry Andric // 75ffd83dbSDimitry Andric //===----------------------------------------------------------------------===// 85ffd83dbSDimitry Andric // 95ffd83dbSDimitry Andric // This is a partial implementation of the Mach-O export trie format. It's 105ffd83dbSDimitry Andric // essentially a symbol table encoded as a compressed prefix trie, meaning that 115ffd83dbSDimitry Andric // the common prefixes of each symbol name are shared for a more compact 125ffd83dbSDimitry Andric // representation. The prefixes are stored on the edges of the trie, and one 135ffd83dbSDimitry Andric // edge can represent multiple characters. For example, given two exported 145ffd83dbSDimitry Andric // symbols _bar and _baz, we will have a trie like this (terminal nodes are 155ffd83dbSDimitry Andric // marked with an asterisk): 165ffd83dbSDimitry Andric // 175ffd83dbSDimitry Andric // +-+-+ 185ffd83dbSDimitry Andric // | | // root node 195ffd83dbSDimitry Andric // +-+-+ 205ffd83dbSDimitry Andric // | 215ffd83dbSDimitry Andric // | _ba 225ffd83dbSDimitry Andric // | 235ffd83dbSDimitry Andric // +-+-+ 245ffd83dbSDimitry Andric // | | 255ffd83dbSDimitry Andric // +-+-+ 265ffd83dbSDimitry Andric // r / \ z 275ffd83dbSDimitry Andric // / \ 285ffd83dbSDimitry Andric // +-+-+ +-+-+ 295ffd83dbSDimitry Andric // | * | | * | 305ffd83dbSDimitry Andric // +-+-+ +-+-+ 315ffd83dbSDimitry Andric // 325ffd83dbSDimitry Andric // More documentation of the format can be found in 335ffd83dbSDimitry Andric // llvm/tools/obj2yaml/macho2yaml.cpp. 345ffd83dbSDimitry Andric // 355ffd83dbSDimitry Andric //===----------------------------------------------------------------------===// 365ffd83dbSDimitry Andric 375ffd83dbSDimitry Andric #include "ExportTrie.h" 385ffd83dbSDimitry Andric #include "Symbols.h" 395ffd83dbSDimitry Andric 405ffd83dbSDimitry Andric #include "lld/Common/ErrorHandler.h" 415ffd83dbSDimitry Andric #include "lld/Common/Memory.h" 425ffd83dbSDimitry Andric #include "llvm/ADT/Optional.h" 435ffd83dbSDimitry Andric #include "llvm/BinaryFormat/MachO.h" 445ffd83dbSDimitry Andric #include "llvm/Support/LEB128.h" 455ffd83dbSDimitry Andric 465ffd83dbSDimitry Andric using namespace llvm; 475ffd83dbSDimitry Andric using namespace llvm::MachO; 485ffd83dbSDimitry Andric using namespace lld; 495ffd83dbSDimitry Andric using namespace lld::macho; 505ffd83dbSDimitry Andric 515ffd83dbSDimitry Andric namespace { 525ffd83dbSDimitry Andric 535ffd83dbSDimitry Andric struct Edge { 545ffd83dbSDimitry Andric Edge(StringRef s, TrieNode *node) : substring(s), child(node) {} 555ffd83dbSDimitry Andric 565ffd83dbSDimitry Andric StringRef substring; 575ffd83dbSDimitry Andric struct TrieNode *child; 585ffd83dbSDimitry Andric }; 595ffd83dbSDimitry Andric 605ffd83dbSDimitry Andric struct ExportInfo { 615ffd83dbSDimitry Andric uint64_t address; 62*e8d8bef9SDimitry Andric uint8_t flags = 0; 63*e8d8bef9SDimitry Andric ExportInfo(const Symbol &sym, uint64_t imageBase) 64*e8d8bef9SDimitry Andric : address(sym.getVA() - imageBase) { 65*e8d8bef9SDimitry Andric // Set the symbol type. 66*e8d8bef9SDimitry Andric if (sym.isWeakDef()) 67*e8d8bef9SDimitry Andric flags |= EXPORT_SYMBOL_FLAGS_WEAK_DEFINITION; 685ffd83dbSDimitry Andric // TODO: Add proper support for re-exports & stub-and-resolver flags. 69*e8d8bef9SDimitry Andric 70*e8d8bef9SDimitry Andric // Set the symbol kind. 71*e8d8bef9SDimitry Andric if (sym.isTlv()) { 72*e8d8bef9SDimitry Andric flags |= EXPORT_SYMBOL_FLAGS_KIND_THREAD_LOCAL; 73*e8d8bef9SDimitry Andric } else if (auto *defined = dyn_cast<Defined>(&sym)) { 74*e8d8bef9SDimitry Andric if (defined->isAbsolute()) 75*e8d8bef9SDimitry Andric flags |= EXPORT_SYMBOL_FLAGS_KIND_ABSOLUTE; 76*e8d8bef9SDimitry Andric } 77*e8d8bef9SDimitry Andric } 785ffd83dbSDimitry Andric }; 795ffd83dbSDimitry Andric 805ffd83dbSDimitry Andric } // namespace 815ffd83dbSDimitry Andric 825ffd83dbSDimitry Andric struct macho::TrieNode { 835ffd83dbSDimitry Andric std::vector<Edge> edges; 845ffd83dbSDimitry Andric Optional<ExportInfo> info; 855ffd83dbSDimitry Andric // Estimated offset from the start of the serialized trie to the current node. 865ffd83dbSDimitry Andric // This will converge to the true offset when updateOffset() is run to a 875ffd83dbSDimitry Andric // fixpoint. 885ffd83dbSDimitry Andric size_t offset = 0; 895ffd83dbSDimitry Andric 905ffd83dbSDimitry Andric // Returns whether the new estimated offset differs from the old one. 915ffd83dbSDimitry Andric bool updateOffset(size_t &nextOffset); 925ffd83dbSDimitry Andric void writeTo(uint8_t *buf) const; 935ffd83dbSDimitry Andric }; 945ffd83dbSDimitry Andric 955ffd83dbSDimitry Andric bool TrieNode::updateOffset(size_t &nextOffset) { 965ffd83dbSDimitry Andric // Size of the whole node (including the terminalSize and the outgoing edges.) 975ffd83dbSDimitry Andric // In contrast, terminalSize only records the size of the other data in the 985ffd83dbSDimitry Andric // node. 995ffd83dbSDimitry Andric size_t nodeSize; 1005ffd83dbSDimitry Andric if (info) { 1015ffd83dbSDimitry Andric uint32_t terminalSize = 102*e8d8bef9SDimitry Andric getULEB128Size(info->flags) + getULEB128Size(info->address); 1035ffd83dbSDimitry Andric // Overall node size so far is the uleb128 size of the length of the symbol 1045ffd83dbSDimitry Andric // info + the symbol info itself. 1055ffd83dbSDimitry Andric nodeSize = terminalSize + getULEB128Size(terminalSize); 1065ffd83dbSDimitry Andric } else { 1075ffd83dbSDimitry Andric nodeSize = 1; // Size of terminalSize (which has a value of 0) 1085ffd83dbSDimitry Andric } 1095ffd83dbSDimitry Andric // Compute size of all child edges. 1105ffd83dbSDimitry Andric ++nodeSize; // Byte for number of children. 1115ffd83dbSDimitry Andric for (Edge &edge : edges) { 1125ffd83dbSDimitry Andric nodeSize += edge.substring.size() + 1 // String length. 1135ffd83dbSDimitry Andric + getULEB128Size(edge.child->offset); // Offset len. 1145ffd83dbSDimitry Andric } 1155ffd83dbSDimitry Andric // On input, 'nextOffset' is the new preferred location for this node. 1165ffd83dbSDimitry Andric bool result = (offset != nextOffset); 1175ffd83dbSDimitry Andric // Store new location in node object for use by parents. 1185ffd83dbSDimitry Andric offset = nextOffset; 1195ffd83dbSDimitry Andric nextOffset += nodeSize; 1205ffd83dbSDimitry Andric return result; 1215ffd83dbSDimitry Andric } 1225ffd83dbSDimitry Andric 1235ffd83dbSDimitry Andric void TrieNode::writeTo(uint8_t *buf) const { 1245ffd83dbSDimitry Andric buf += offset; 1255ffd83dbSDimitry Andric if (info) { 1265ffd83dbSDimitry Andric // TrieNodes with Symbol info: size, flags address 1275ffd83dbSDimitry Andric uint32_t terminalSize = 128*e8d8bef9SDimitry Andric getULEB128Size(info->flags) + getULEB128Size(info->address); 1295ffd83dbSDimitry Andric buf += encodeULEB128(terminalSize, buf); 130*e8d8bef9SDimitry Andric buf += encodeULEB128(info->flags, buf); 1315ffd83dbSDimitry Andric buf += encodeULEB128(info->address, buf); 1325ffd83dbSDimitry Andric } else { 1335ffd83dbSDimitry Andric // TrieNode with no Symbol info. 1345ffd83dbSDimitry Andric *buf++ = 0; // terminalSize 1355ffd83dbSDimitry Andric } 1365ffd83dbSDimitry Andric // Add number of children. TODO: Handle case where we have more than 256. 1375ffd83dbSDimitry Andric assert(edges.size() < 256); 1385ffd83dbSDimitry Andric *buf++ = edges.size(); 1395ffd83dbSDimitry Andric // Append each child edge substring and node offset. 1405ffd83dbSDimitry Andric for (const Edge &edge : edges) { 1415ffd83dbSDimitry Andric memcpy(buf, edge.substring.data(), edge.substring.size()); 1425ffd83dbSDimitry Andric buf += edge.substring.size(); 1435ffd83dbSDimitry Andric *buf++ = '\0'; 1445ffd83dbSDimitry Andric buf += encodeULEB128(edge.child->offset, buf); 1455ffd83dbSDimitry Andric } 1465ffd83dbSDimitry Andric } 1475ffd83dbSDimitry Andric 1485ffd83dbSDimitry Andric TrieNode *TrieBuilder::makeNode() { 1495ffd83dbSDimitry Andric auto *node = make<TrieNode>(); 1505ffd83dbSDimitry Andric nodes.emplace_back(node); 1515ffd83dbSDimitry Andric return node; 1525ffd83dbSDimitry Andric } 1535ffd83dbSDimitry Andric 1545ffd83dbSDimitry Andric static int charAt(const Symbol *sym, size_t pos) { 1555ffd83dbSDimitry Andric StringRef str = sym->getName(); 1565ffd83dbSDimitry Andric if (pos >= str.size()) 1575ffd83dbSDimitry Andric return -1; 1585ffd83dbSDimitry Andric return str[pos]; 1595ffd83dbSDimitry Andric } 1605ffd83dbSDimitry Andric 1615ffd83dbSDimitry Andric // Build the trie by performing a three-way radix quicksort: We start by sorting 1625ffd83dbSDimitry Andric // the strings by their first characters, then sort the strings with the same 1635ffd83dbSDimitry Andric // first characters by their second characters, and so on recursively. Each 1645ffd83dbSDimitry Andric // time the prefixes diverge, we add a node to the trie. 1655ffd83dbSDimitry Andric // 1665ffd83dbSDimitry Andric // node: The most recently created node along this path in the trie (i.e. 1675ffd83dbSDimitry Andric // the furthest from the root.) 1685ffd83dbSDimitry Andric // lastPos: The prefix length of the most recently created node, i.e. the number 1695ffd83dbSDimitry Andric // of characters along its path from the root. 1705ffd83dbSDimitry Andric // pos: The string index we are currently sorting on. Note that each symbol 1715ffd83dbSDimitry Andric // S contained in vec has the same prefix S[0...pos). 1725ffd83dbSDimitry Andric void TrieBuilder::sortAndBuild(MutableArrayRef<const Symbol *> vec, 1735ffd83dbSDimitry Andric TrieNode *node, size_t lastPos, size_t pos) { 1745ffd83dbSDimitry Andric tailcall: 1755ffd83dbSDimitry Andric if (vec.empty()) 1765ffd83dbSDimitry Andric return; 1775ffd83dbSDimitry Andric 1785ffd83dbSDimitry Andric // Partition items so that items in [0, i) are less than the pivot, 1795ffd83dbSDimitry Andric // [i, j) are the same as the pivot, and [j, vec.size()) are greater than 1805ffd83dbSDimitry Andric // the pivot. 1815ffd83dbSDimitry Andric const Symbol *pivotSymbol = vec[vec.size() / 2]; 1825ffd83dbSDimitry Andric int pivot = charAt(pivotSymbol, pos); 1835ffd83dbSDimitry Andric size_t i = 0; 1845ffd83dbSDimitry Andric size_t j = vec.size(); 1855ffd83dbSDimitry Andric for (size_t k = 0; k < j;) { 1865ffd83dbSDimitry Andric int c = charAt(vec[k], pos); 1875ffd83dbSDimitry Andric if (c < pivot) 1885ffd83dbSDimitry Andric std::swap(vec[i++], vec[k++]); 1895ffd83dbSDimitry Andric else if (c > pivot) 1905ffd83dbSDimitry Andric std::swap(vec[--j], vec[k]); 1915ffd83dbSDimitry Andric else 1925ffd83dbSDimitry Andric k++; 1935ffd83dbSDimitry Andric } 1945ffd83dbSDimitry Andric 1955ffd83dbSDimitry Andric bool isTerminal = pivot == -1; 1965ffd83dbSDimitry Andric bool prefixesDiverge = i != 0 || j != vec.size(); 1975ffd83dbSDimitry Andric if (lastPos != pos && (isTerminal || prefixesDiverge)) { 1985ffd83dbSDimitry Andric TrieNode *newNode = makeNode(); 1995ffd83dbSDimitry Andric node->edges.emplace_back(pivotSymbol->getName().slice(lastPos, pos), 2005ffd83dbSDimitry Andric newNode); 2015ffd83dbSDimitry Andric node = newNode; 2025ffd83dbSDimitry Andric lastPos = pos; 2035ffd83dbSDimitry Andric } 2045ffd83dbSDimitry Andric 2055ffd83dbSDimitry Andric sortAndBuild(vec.slice(0, i), node, lastPos, pos); 2065ffd83dbSDimitry Andric sortAndBuild(vec.slice(j), node, lastPos, pos); 2075ffd83dbSDimitry Andric 2085ffd83dbSDimitry Andric if (isTerminal) { 2095ffd83dbSDimitry Andric assert(j - i == 1); // no duplicate symbols 210*e8d8bef9SDimitry Andric node->info = ExportInfo(*pivotSymbol, imageBase); 2115ffd83dbSDimitry Andric } else { 2125ffd83dbSDimitry Andric // This is the tail-call-optimized version of the following: 2135ffd83dbSDimitry Andric // sortAndBuild(vec.slice(i, j - i), node, lastPos, pos + 1); 2145ffd83dbSDimitry Andric vec = vec.slice(i, j - i); 2155ffd83dbSDimitry Andric ++pos; 2165ffd83dbSDimitry Andric goto tailcall; 2175ffd83dbSDimitry Andric } 2185ffd83dbSDimitry Andric } 2195ffd83dbSDimitry Andric 2205ffd83dbSDimitry Andric size_t TrieBuilder::build() { 2215ffd83dbSDimitry Andric if (exported.empty()) 2225ffd83dbSDimitry Andric return 0; 2235ffd83dbSDimitry Andric 2245ffd83dbSDimitry Andric TrieNode *root = makeNode(); 2255ffd83dbSDimitry Andric sortAndBuild(exported, root, 0, 0); 2265ffd83dbSDimitry Andric 2275ffd83dbSDimitry Andric // Assign each node in the vector an offset in the trie stream, iterating 2285ffd83dbSDimitry Andric // until all uleb128 sizes have stabilized. 2295ffd83dbSDimitry Andric size_t offset; 2305ffd83dbSDimitry Andric bool more; 2315ffd83dbSDimitry Andric do { 2325ffd83dbSDimitry Andric offset = 0; 2335ffd83dbSDimitry Andric more = false; 2345ffd83dbSDimitry Andric for (TrieNode *node : nodes) 2355ffd83dbSDimitry Andric more |= node->updateOffset(offset); 2365ffd83dbSDimitry Andric } while (more); 2375ffd83dbSDimitry Andric 2385ffd83dbSDimitry Andric return offset; 2395ffd83dbSDimitry Andric } 2405ffd83dbSDimitry Andric 2415ffd83dbSDimitry Andric void TrieBuilder::writeTo(uint8_t *buf) const { 2425ffd83dbSDimitry Andric for (TrieNode *node : nodes) 2435ffd83dbSDimitry Andric node->writeTo(buf); 2445ffd83dbSDimitry Andric } 2455ffd83dbSDimitry Andric 2465ffd83dbSDimitry Andric namespace { 2475ffd83dbSDimitry Andric 2485ffd83dbSDimitry Andric // Parse a serialized trie and invoke a callback for each entry. 2495ffd83dbSDimitry Andric class TrieParser { 2505ffd83dbSDimitry Andric public: 2515ffd83dbSDimitry Andric TrieParser(const uint8_t *buf, size_t size, const TrieEntryCallback &callback) 2525ffd83dbSDimitry Andric : start(buf), end(start + size), callback(callback) {} 2535ffd83dbSDimitry Andric 2545ffd83dbSDimitry Andric void parse(const uint8_t *buf, const Twine &cumulativeString); 2555ffd83dbSDimitry Andric 2565ffd83dbSDimitry Andric void parse() { parse(start, ""); } 2575ffd83dbSDimitry Andric 2585ffd83dbSDimitry Andric const uint8_t *start; 2595ffd83dbSDimitry Andric const uint8_t *end; 2605ffd83dbSDimitry Andric const TrieEntryCallback &callback; 2615ffd83dbSDimitry Andric }; 2625ffd83dbSDimitry Andric 2635ffd83dbSDimitry Andric } // namespace 2645ffd83dbSDimitry Andric 2655ffd83dbSDimitry Andric void TrieParser::parse(const uint8_t *buf, const Twine &cumulativeString) { 2665ffd83dbSDimitry Andric if (buf >= end) 2675ffd83dbSDimitry Andric fatal("Node offset points outside export section"); 2685ffd83dbSDimitry Andric 2695ffd83dbSDimitry Andric unsigned ulebSize; 2705ffd83dbSDimitry Andric uint64_t terminalSize = decodeULEB128(buf, &ulebSize); 2715ffd83dbSDimitry Andric buf += ulebSize; 2725ffd83dbSDimitry Andric uint64_t flags = 0; 2735ffd83dbSDimitry Andric size_t offset; 2745ffd83dbSDimitry Andric if (terminalSize != 0) { 2755ffd83dbSDimitry Andric flags = decodeULEB128(buf, &ulebSize); 2765ffd83dbSDimitry Andric callback(cumulativeString, flags); 2775ffd83dbSDimitry Andric } 2785ffd83dbSDimitry Andric buf += terminalSize; 2795ffd83dbSDimitry Andric uint8_t numEdges = *buf++; 2805ffd83dbSDimitry Andric for (uint8_t i = 0; i < numEdges; ++i) { 2815ffd83dbSDimitry Andric const char *cbuf = reinterpret_cast<const char *>(buf); 2825ffd83dbSDimitry Andric StringRef substring = StringRef(cbuf, strnlen(cbuf, end - buf)); 2835ffd83dbSDimitry Andric buf += substring.size() + 1; 2845ffd83dbSDimitry Andric offset = decodeULEB128(buf, &ulebSize); 2855ffd83dbSDimitry Andric buf += ulebSize; 2865ffd83dbSDimitry Andric parse(start + offset, cumulativeString + substring); 2875ffd83dbSDimitry Andric } 2885ffd83dbSDimitry Andric } 2895ffd83dbSDimitry Andric 2905ffd83dbSDimitry Andric void macho::parseTrie(const uint8_t *buf, size_t size, 2915ffd83dbSDimitry Andric const TrieEntryCallback &callback) { 2925ffd83dbSDimitry Andric if (size == 0) 2935ffd83dbSDimitry Andric return; 2945ffd83dbSDimitry Andric 2955ffd83dbSDimitry Andric TrieParser(buf, size, callback).parse(); 2965ffd83dbSDimitry Andric } 297