15ffd83dbSDimitry Andric //===- ExportTrie.cpp -----------------------------------------------------===//
25ffd83dbSDimitry Andric //
35ffd83dbSDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
45ffd83dbSDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
55ffd83dbSDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
65ffd83dbSDimitry Andric //
75ffd83dbSDimitry Andric //===----------------------------------------------------------------------===//
85ffd83dbSDimitry Andric //
// This is a partial implementation of the Mach-O export trie format. It's
// essentially a symbol table encoded as a compressed prefix trie, meaning that
// the common prefixes of each symbol name are shared for a more compact
// representation. The prefixes are stored on the edges of the trie, and one
// edge can represent multiple characters. For example, given two exported
// symbols _bar and _baz, we will have a trie like this (terminal nodes are
// marked with an asterisk):
//
//              +-+-+
//              |   | // root node
//              +-+-+
//                |
//                | _ba
//                |
//              +-+-+
//              |   |
//              +-+-+
//           r /     \ z
//            /       \
//        +-+-+       +-+-+
//        | * |       | * |
//        +-+-+       +-+-+
//
// More documentation of the format can be found in
// llvm/tools/obj2yaml/macho2yaml.cpp.
//
355ffd83dbSDimitry Andric //===----------------------------------------------------------------------===//
365ffd83dbSDimitry Andric
375ffd83dbSDimitry Andric #include "ExportTrie.h"
385ffd83dbSDimitry Andric #include "Symbols.h"
395ffd83dbSDimitry Andric
405ffd83dbSDimitry Andric #include "lld/Common/ErrorHandler.h"
415ffd83dbSDimitry Andric #include "lld/Common/Memory.h"
425ffd83dbSDimitry Andric #include "llvm/BinaryFormat/MachO.h"
435ffd83dbSDimitry Andric #include "llvm/Support/LEB128.h"
44bdd1243dSDimitry Andric #include <optional>
455ffd83dbSDimitry Andric
465ffd83dbSDimitry Andric using namespace llvm;
475ffd83dbSDimitry Andric using namespace lld;
485ffd83dbSDimitry Andric using namespace lld::macho;
495ffd83dbSDimitry Andric
505ffd83dbSDimitry Andric namespace {
515ffd83dbSDimitry Andric
525ffd83dbSDimitry Andric struct Edge {
Edge__anonf3d890b80111::Edge535ffd83dbSDimitry Andric Edge(StringRef s, TrieNode *node) : substring(s), child(node) {}
545ffd83dbSDimitry Andric
555ffd83dbSDimitry Andric StringRef substring;
565ffd83dbSDimitry Andric struct TrieNode *child;
575ffd83dbSDimitry Andric };
585ffd83dbSDimitry Andric
595ffd83dbSDimitry Andric struct ExportInfo {
605ffd83dbSDimitry Andric uint64_t address;
61*06c3fb27SDimitry Andric uint64_t ordinal = 0;
62e8d8bef9SDimitry Andric uint8_t flags = 0;
ExportInfo__anonf3d890b80111::ExportInfo63e8d8bef9SDimitry Andric ExportInfo(const Symbol &sym, uint64_t imageBase)
64e8d8bef9SDimitry Andric : address(sym.getVA() - imageBase) {
65fe6060f1SDimitry Andric using namespace llvm::MachO;
66e8d8bef9SDimitry Andric if (sym.isWeakDef())
67e8d8bef9SDimitry Andric flags |= EXPORT_SYMBOL_FLAGS_WEAK_DEFINITION;
68*06c3fb27SDimitry Andric if (sym.isTlv())
69e8d8bef9SDimitry Andric flags |= EXPORT_SYMBOL_FLAGS_KIND_THREAD_LOCAL;
70*06c3fb27SDimitry Andric // TODO: Add proper support for stub-and-resolver flags.
71*06c3fb27SDimitry Andric
72*06c3fb27SDimitry Andric if (auto *defined = dyn_cast<Defined>(&sym)) {
73e8d8bef9SDimitry Andric if (defined->isAbsolute())
74e8d8bef9SDimitry Andric flags |= EXPORT_SYMBOL_FLAGS_KIND_ABSOLUTE;
75*06c3fb27SDimitry Andric } else if (auto *dysym = dyn_cast<DylibSymbol>(&sym)) {
76*06c3fb27SDimitry Andric flags |= EXPORT_SYMBOL_FLAGS_REEXPORT;
77*06c3fb27SDimitry Andric if (!dysym->isDynamicLookup())
78*06c3fb27SDimitry Andric ordinal = dysym->getFile()->ordinal;
79e8d8bef9SDimitry Andric }
80e8d8bef9SDimitry Andric }
815ffd83dbSDimitry Andric };
825ffd83dbSDimitry Andric
835ffd83dbSDimitry Andric } // namespace
845ffd83dbSDimitry Andric
855ffd83dbSDimitry Andric struct macho::TrieNode {
865ffd83dbSDimitry Andric std::vector<Edge> edges;
87bdd1243dSDimitry Andric std::optional<ExportInfo> info;
885ffd83dbSDimitry Andric // Estimated offset from the start of the serialized trie to the current node.
895ffd83dbSDimitry Andric // This will converge to the true offset when updateOffset() is run to a
905ffd83dbSDimitry Andric // fixpoint.
915ffd83dbSDimitry Andric size_t offset = 0;
925ffd83dbSDimitry Andric
93*06c3fb27SDimitry Andric uint32_t getTerminalSize() const;
945ffd83dbSDimitry Andric // Returns whether the new estimated offset differs from the old one.
955ffd83dbSDimitry Andric bool updateOffset(size_t &nextOffset);
965ffd83dbSDimitry Andric void writeTo(uint8_t *buf) const;
975ffd83dbSDimitry Andric };
985ffd83dbSDimitry Andric
99*06c3fb27SDimitry Andric // For regular symbols, the node layout (excluding the children) is
100*06c3fb27SDimitry Andric //
101*06c3fb27SDimitry Andric // uleb128 terminalSize;
102*06c3fb27SDimitry Andric // uleb128 flags;
103*06c3fb27SDimitry Andric // uleb128 address;
104*06c3fb27SDimitry Andric //
105*06c3fb27SDimitry Andric // For re-exported symbols, the layout is
106*06c3fb27SDimitry Andric //
107*06c3fb27SDimitry Andric // uleb128 terminalSize;
108*06c3fb27SDimitry Andric // uleb128 flags;
109*06c3fb27SDimitry Andric // uleb128 ordinal;
110*06c3fb27SDimitry Andric // char[] originalName;
111*06c3fb27SDimitry Andric //
112*06c3fb27SDimitry Andric // If libfoo.dylib is linked against libbar.dylib, and libfoo exports an alias
113*06c3fb27SDimitry Andric // _foo to a symbol _bar in libbar, then originalName will be "_bar". If libfoo
114*06c3fb27SDimitry Andric // re-exports _bar directly (i.e. not via an alias), then originalName will be
115*06c3fb27SDimitry Andric // the empty string.
116*06c3fb27SDimitry Andric //
117*06c3fb27SDimitry Andric // TODO: Support aliased re-exports. (Since we don't yet support these,
118*06c3fb27SDimitry Andric // originalName will always be the empty string.)
119*06c3fb27SDimitry Andric //
120*06c3fb27SDimitry Andric // For stub-and-resolver nodes, the layout is
121*06c3fb27SDimitry Andric //
122*06c3fb27SDimitry Andric // uleb128 terminalSize;
123*06c3fb27SDimitry Andric // uleb128 flags;
124*06c3fb27SDimitry Andric // uleb128 stubAddress;
125*06c3fb27SDimitry Andric // uleb128 resolverAddress;
126*06c3fb27SDimitry Andric //
127*06c3fb27SDimitry Andric // TODO: Support stub-and-resolver nodes.
getTerminalSize() const128*06c3fb27SDimitry Andric uint32_t TrieNode::getTerminalSize() const {
129*06c3fb27SDimitry Andric uint32_t size = getULEB128Size(info->flags);
130*06c3fb27SDimitry Andric if (info->flags & MachO::EXPORT_SYMBOL_FLAGS_REEXPORT)
131*06c3fb27SDimitry Andric size += getULEB128Size(info->ordinal) + 1; // + 1 for the null-terminator
132*06c3fb27SDimitry Andric else
133*06c3fb27SDimitry Andric size += getULEB128Size(info->address);
134*06c3fb27SDimitry Andric return size;
135*06c3fb27SDimitry Andric }
136*06c3fb27SDimitry Andric
updateOffset(size_t & nextOffset)1375ffd83dbSDimitry Andric bool TrieNode::updateOffset(size_t &nextOffset) {
1385ffd83dbSDimitry Andric // Size of the whole node (including the terminalSize and the outgoing edges.)
1395ffd83dbSDimitry Andric // In contrast, terminalSize only records the size of the other data in the
1405ffd83dbSDimitry Andric // node.
1415ffd83dbSDimitry Andric size_t nodeSize;
1425ffd83dbSDimitry Andric if (info) {
143*06c3fb27SDimitry Andric uint32_t terminalSize = getTerminalSize();
1445ffd83dbSDimitry Andric // Overall node size so far is the uleb128 size of the length of the symbol
1455ffd83dbSDimitry Andric // info + the symbol info itself.
1465ffd83dbSDimitry Andric nodeSize = terminalSize + getULEB128Size(terminalSize);
1475ffd83dbSDimitry Andric } else {
1485ffd83dbSDimitry Andric nodeSize = 1; // Size of terminalSize (which has a value of 0)
1495ffd83dbSDimitry Andric }
1505ffd83dbSDimitry Andric // Compute size of all child edges.
1515ffd83dbSDimitry Andric ++nodeSize; // Byte for number of children.
152fe6060f1SDimitry Andric for (const Edge &edge : edges) {
1535ffd83dbSDimitry Andric nodeSize += edge.substring.size() + 1 // String length.
1545ffd83dbSDimitry Andric + getULEB128Size(edge.child->offset); // Offset len.
1555ffd83dbSDimitry Andric }
1565ffd83dbSDimitry Andric // On input, 'nextOffset' is the new preferred location for this node.
1575ffd83dbSDimitry Andric bool result = (offset != nextOffset);
1585ffd83dbSDimitry Andric // Store new location in node object for use by parents.
1595ffd83dbSDimitry Andric offset = nextOffset;
1605ffd83dbSDimitry Andric nextOffset += nodeSize;
1615ffd83dbSDimitry Andric return result;
1625ffd83dbSDimitry Andric }
1635ffd83dbSDimitry Andric
writeTo(uint8_t * buf) const1645ffd83dbSDimitry Andric void TrieNode::writeTo(uint8_t *buf) const {
1655ffd83dbSDimitry Andric buf += offset;
1665ffd83dbSDimitry Andric if (info) {
167*06c3fb27SDimitry Andric uint32_t terminalSize = getTerminalSize();
1685ffd83dbSDimitry Andric buf += encodeULEB128(terminalSize, buf);
169e8d8bef9SDimitry Andric buf += encodeULEB128(info->flags, buf);
170*06c3fb27SDimitry Andric if (info->flags & MachO::EXPORT_SYMBOL_FLAGS_REEXPORT) {
171*06c3fb27SDimitry Andric buf += encodeULEB128(info->ordinal, buf);
172*06c3fb27SDimitry Andric *buf++ = 0; // empty originalName string
173*06c3fb27SDimitry Andric } else {
1745ffd83dbSDimitry Andric buf += encodeULEB128(info->address, buf);
175*06c3fb27SDimitry Andric }
1765ffd83dbSDimitry Andric } else {
1775ffd83dbSDimitry Andric // TrieNode with no Symbol info.
1785ffd83dbSDimitry Andric *buf++ = 0; // terminalSize
1795ffd83dbSDimitry Andric }
1805ffd83dbSDimitry Andric // Add number of children. TODO: Handle case where we have more than 256.
1815ffd83dbSDimitry Andric assert(edges.size() < 256);
1825ffd83dbSDimitry Andric *buf++ = edges.size();
1835ffd83dbSDimitry Andric // Append each child edge substring and node offset.
1845ffd83dbSDimitry Andric for (const Edge &edge : edges) {
1855ffd83dbSDimitry Andric memcpy(buf, edge.substring.data(), edge.substring.size());
1865ffd83dbSDimitry Andric buf += edge.substring.size();
1875ffd83dbSDimitry Andric *buf++ = '\0';
1885ffd83dbSDimitry Andric buf += encodeULEB128(edge.child->offset, buf);
1895ffd83dbSDimitry Andric }
1905ffd83dbSDimitry Andric }
1915ffd83dbSDimitry Andric
~TrieBuilder()19281ad6265SDimitry Andric TrieBuilder::~TrieBuilder() {
19381ad6265SDimitry Andric for (TrieNode *node : nodes)
19481ad6265SDimitry Andric delete node;
19581ad6265SDimitry Andric }
19681ad6265SDimitry Andric
makeNode()1975ffd83dbSDimitry Andric TrieNode *TrieBuilder::makeNode() {
19881ad6265SDimitry Andric auto *node = new TrieNode();
1995ffd83dbSDimitry Andric nodes.emplace_back(node);
2005ffd83dbSDimitry Andric return node;
2015ffd83dbSDimitry Andric }
2025ffd83dbSDimitry Andric
charAt(const Symbol * sym,size_t pos)2035ffd83dbSDimitry Andric static int charAt(const Symbol *sym, size_t pos) {
2045ffd83dbSDimitry Andric StringRef str = sym->getName();
2055ffd83dbSDimitry Andric if (pos >= str.size())
2065ffd83dbSDimitry Andric return -1;
2075ffd83dbSDimitry Andric return str[pos];
2085ffd83dbSDimitry Andric }
2095ffd83dbSDimitry Andric
2105ffd83dbSDimitry Andric // Build the trie by performing a three-way radix quicksort: We start by sorting
2115ffd83dbSDimitry Andric // the strings by their first characters, then sort the strings with the same
2125ffd83dbSDimitry Andric // first characters by their second characters, and so on recursively. Each
2135ffd83dbSDimitry Andric // time the prefixes diverge, we add a node to the trie.
2145ffd83dbSDimitry Andric //
2155ffd83dbSDimitry Andric // node: The most recently created node along this path in the trie (i.e.
2165ffd83dbSDimitry Andric // the furthest from the root.)
2175ffd83dbSDimitry Andric // lastPos: The prefix length of the most recently created node, i.e. the number
2185ffd83dbSDimitry Andric // of characters along its path from the root.
2195ffd83dbSDimitry Andric // pos: The string index we are currently sorting on. Note that each symbol
2205ffd83dbSDimitry Andric // S contained in vec has the same prefix S[0...pos).
sortAndBuild(MutableArrayRef<const Symbol * > vec,TrieNode * node,size_t lastPos,size_t pos)2215ffd83dbSDimitry Andric void TrieBuilder::sortAndBuild(MutableArrayRef<const Symbol *> vec,
2225ffd83dbSDimitry Andric TrieNode *node, size_t lastPos, size_t pos) {
2235ffd83dbSDimitry Andric tailcall:
2245ffd83dbSDimitry Andric if (vec.empty())
2255ffd83dbSDimitry Andric return;
2265ffd83dbSDimitry Andric
2275ffd83dbSDimitry Andric // Partition items so that items in [0, i) are less than the pivot,
2285ffd83dbSDimitry Andric // [i, j) are the same as the pivot, and [j, vec.size()) are greater than
2295ffd83dbSDimitry Andric // the pivot.
2305ffd83dbSDimitry Andric const Symbol *pivotSymbol = vec[vec.size() / 2];
2315ffd83dbSDimitry Andric int pivot = charAt(pivotSymbol, pos);
2325ffd83dbSDimitry Andric size_t i = 0;
2335ffd83dbSDimitry Andric size_t j = vec.size();
2345ffd83dbSDimitry Andric for (size_t k = 0; k < j;) {
2355ffd83dbSDimitry Andric int c = charAt(vec[k], pos);
2365ffd83dbSDimitry Andric if (c < pivot)
2375ffd83dbSDimitry Andric std::swap(vec[i++], vec[k++]);
2385ffd83dbSDimitry Andric else if (c > pivot)
2395ffd83dbSDimitry Andric std::swap(vec[--j], vec[k]);
2405ffd83dbSDimitry Andric else
2415ffd83dbSDimitry Andric k++;
2425ffd83dbSDimitry Andric }
2435ffd83dbSDimitry Andric
2445ffd83dbSDimitry Andric bool isTerminal = pivot == -1;
2455ffd83dbSDimitry Andric bool prefixesDiverge = i != 0 || j != vec.size();
2465ffd83dbSDimitry Andric if (lastPos != pos && (isTerminal || prefixesDiverge)) {
2475ffd83dbSDimitry Andric TrieNode *newNode = makeNode();
2485ffd83dbSDimitry Andric node->edges.emplace_back(pivotSymbol->getName().slice(lastPos, pos),
2495ffd83dbSDimitry Andric newNode);
2505ffd83dbSDimitry Andric node = newNode;
2515ffd83dbSDimitry Andric lastPos = pos;
2525ffd83dbSDimitry Andric }
2535ffd83dbSDimitry Andric
2545ffd83dbSDimitry Andric sortAndBuild(vec.slice(0, i), node, lastPos, pos);
2555ffd83dbSDimitry Andric sortAndBuild(vec.slice(j), node, lastPos, pos);
2565ffd83dbSDimitry Andric
2575ffd83dbSDimitry Andric if (isTerminal) {
2585ffd83dbSDimitry Andric assert(j - i == 1); // no duplicate symbols
259e8d8bef9SDimitry Andric node->info = ExportInfo(*pivotSymbol, imageBase);
2605ffd83dbSDimitry Andric } else {
2615ffd83dbSDimitry Andric // This is the tail-call-optimized version of the following:
2625ffd83dbSDimitry Andric // sortAndBuild(vec.slice(i, j - i), node, lastPos, pos + 1);
2635ffd83dbSDimitry Andric vec = vec.slice(i, j - i);
2645ffd83dbSDimitry Andric ++pos;
2655ffd83dbSDimitry Andric goto tailcall;
2665ffd83dbSDimitry Andric }
2675ffd83dbSDimitry Andric }
2685ffd83dbSDimitry Andric
build()2695ffd83dbSDimitry Andric size_t TrieBuilder::build() {
2705ffd83dbSDimitry Andric if (exported.empty())
2715ffd83dbSDimitry Andric return 0;
2725ffd83dbSDimitry Andric
2735ffd83dbSDimitry Andric TrieNode *root = makeNode();
2745ffd83dbSDimitry Andric sortAndBuild(exported, root, 0, 0);
2755ffd83dbSDimitry Andric
2765ffd83dbSDimitry Andric // Assign each node in the vector an offset in the trie stream, iterating
2775ffd83dbSDimitry Andric // until all uleb128 sizes have stabilized.
2785ffd83dbSDimitry Andric size_t offset;
2795ffd83dbSDimitry Andric bool more;
2805ffd83dbSDimitry Andric do {
2815ffd83dbSDimitry Andric offset = 0;
2825ffd83dbSDimitry Andric more = false;
2835ffd83dbSDimitry Andric for (TrieNode *node : nodes)
2845ffd83dbSDimitry Andric more |= node->updateOffset(offset);
2855ffd83dbSDimitry Andric } while (more);
2865ffd83dbSDimitry Andric
2875ffd83dbSDimitry Andric return offset;
2885ffd83dbSDimitry Andric }
2895ffd83dbSDimitry Andric
writeTo(uint8_t * buf) const2905ffd83dbSDimitry Andric void TrieBuilder::writeTo(uint8_t *buf) const {
2915ffd83dbSDimitry Andric for (TrieNode *node : nodes)
2925ffd83dbSDimitry Andric node->writeTo(buf);
2935ffd83dbSDimitry Andric }
2945ffd83dbSDimitry Andric
2955ffd83dbSDimitry Andric namespace {
2965ffd83dbSDimitry Andric
2975ffd83dbSDimitry Andric // Parse a serialized trie and invoke a callback for each entry.
2985ffd83dbSDimitry Andric class TrieParser {
2995ffd83dbSDimitry Andric public:
TrieParser(const uint8_t * buf,size_t size,const TrieEntryCallback & callback)3005ffd83dbSDimitry Andric TrieParser(const uint8_t *buf, size_t size, const TrieEntryCallback &callback)
3015ffd83dbSDimitry Andric : start(buf), end(start + size), callback(callback) {}
3025ffd83dbSDimitry Andric
3035ffd83dbSDimitry Andric void parse(const uint8_t *buf, const Twine &cumulativeString);
3045ffd83dbSDimitry Andric
parse()3055ffd83dbSDimitry Andric void parse() { parse(start, ""); }
3065ffd83dbSDimitry Andric
3075ffd83dbSDimitry Andric const uint8_t *start;
3085ffd83dbSDimitry Andric const uint8_t *end;
3095ffd83dbSDimitry Andric const TrieEntryCallback &callback;
3105ffd83dbSDimitry Andric };
3115ffd83dbSDimitry Andric
3125ffd83dbSDimitry Andric } // namespace
3135ffd83dbSDimitry Andric
parse(const uint8_t * buf,const Twine & cumulativeString)3145ffd83dbSDimitry Andric void TrieParser::parse(const uint8_t *buf, const Twine &cumulativeString) {
3155ffd83dbSDimitry Andric if (buf >= end)
3165ffd83dbSDimitry Andric fatal("Node offset points outside export section");
3175ffd83dbSDimitry Andric
3185ffd83dbSDimitry Andric unsigned ulebSize;
3195ffd83dbSDimitry Andric uint64_t terminalSize = decodeULEB128(buf, &ulebSize);
3205ffd83dbSDimitry Andric buf += ulebSize;
3215ffd83dbSDimitry Andric uint64_t flags = 0;
3225ffd83dbSDimitry Andric size_t offset;
3235ffd83dbSDimitry Andric if (terminalSize != 0) {
3245ffd83dbSDimitry Andric flags = decodeULEB128(buf, &ulebSize);
3255ffd83dbSDimitry Andric callback(cumulativeString, flags);
3265ffd83dbSDimitry Andric }
3275ffd83dbSDimitry Andric buf += terminalSize;
3285ffd83dbSDimitry Andric uint8_t numEdges = *buf++;
3295ffd83dbSDimitry Andric for (uint8_t i = 0; i < numEdges; ++i) {
3305ffd83dbSDimitry Andric const char *cbuf = reinterpret_cast<const char *>(buf);
3315ffd83dbSDimitry Andric StringRef substring = StringRef(cbuf, strnlen(cbuf, end - buf));
3325ffd83dbSDimitry Andric buf += substring.size() + 1;
3335ffd83dbSDimitry Andric offset = decodeULEB128(buf, &ulebSize);
3345ffd83dbSDimitry Andric buf += ulebSize;
3355ffd83dbSDimitry Andric parse(start + offset, cumulativeString + substring);
3365ffd83dbSDimitry Andric }
3375ffd83dbSDimitry Andric }
3385ffd83dbSDimitry Andric
parseTrie(const uint8_t * buf,size_t size,const TrieEntryCallback & callback)3395ffd83dbSDimitry Andric void macho::parseTrie(const uint8_t *buf, size_t size,
3405ffd83dbSDimitry Andric const TrieEntryCallback &callback) {
3415ffd83dbSDimitry Andric if (size == 0)
3425ffd83dbSDimitry Andric return;
3435ffd83dbSDimitry Andric
3445ffd83dbSDimitry Andric TrieParser(buf, size, callback).parse();
3455ffd83dbSDimitry Andric }
346