xref: /freebsd/contrib/llvm-project/lld/MachO/ExportTrie.cpp (revision 81ad626541db97eb356e2c1d4a20eb2a26a766ab)
15ffd83dbSDimitry Andric //===- ExportTrie.cpp -----------------------------------------------------===//
25ffd83dbSDimitry Andric //
35ffd83dbSDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
45ffd83dbSDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
55ffd83dbSDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
65ffd83dbSDimitry Andric //
75ffd83dbSDimitry Andric //===----------------------------------------------------------------------===//
85ffd83dbSDimitry Andric //
95ffd83dbSDimitry Andric // This is a partial implementation of the Mach-O export trie format. It's
105ffd83dbSDimitry Andric // essentially a symbol table encoded as a compressed prefix trie, meaning that
115ffd83dbSDimitry Andric // the common prefixes of each symbol name are shared for a more compact
125ffd83dbSDimitry Andric // representation. The prefixes are stored on the edges of the trie, and one
135ffd83dbSDimitry Andric // edge can represent multiple characters. For example, given two exported
145ffd83dbSDimitry Andric // symbols _bar and _baz, we will have a trie like this (terminal nodes are
155ffd83dbSDimitry Andric // marked with an asterisk):
165ffd83dbSDimitry Andric //
175ffd83dbSDimitry Andric //              +-+-+
185ffd83dbSDimitry Andric //              |   | // root node
195ffd83dbSDimitry Andric //              +-+-+
205ffd83dbSDimitry Andric //                |
215ffd83dbSDimitry Andric //                | _ba
225ffd83dbSDimitry Andric //                |
235ffd83dbSDimitry Andric //              +-+-+
245ffd83dbSDimitry Andric //              |   |
255ffd83dbSDimitry Andric //              +-+-+
265ffd83dbSDimitry Andric //           r /     \ z
275ffd83dbSDimitry Andric //            /       \
285ffd83dbSDimitry Andric //        +-+-+       +-+-+
295ffd83dbSDimitry Andric //        | * |       | * |
305ffd83dbSDimitry Andric //        +-+-+       +-+-+
315ffd83dbSDimitry Andric //
325ffd83dbSDimitry Andric // More documentation of the format can be found in
335ffd83dbSDimitry Andric // llvm/tools/obj2yaml/macho2yaml.cpp.
345ffd83dbSDimitry Andric //
355ffd83dbSDimitry Andric //===----------------------------------------------------------------------===//
365ffd83dbSDimitry Andric 
375ffd83dbSDimitry Andric #include "ExportTrie.h"
385ffd83dbSDimitry Andric #include "Symbols.h"
395ffd83dbSDimitry Andric 
405ffd83dbSDimitry Andric #include "lld/Common/ErrorHandler.h"
415ffd83dbSDimitry Andric #include "lld/Common/Memory.h"
425ffd83dbSDimitry Andric #include "llvm/ADT/Optional.h"
435ffd83dbSDimitry Andric #include "llvm/BinaryFormat/MachO.h"
445ffd83dbSDimitry Andric #include "llvm/Support/LEB128.h"
455ffd83dbSDimitry Andric 
465ffd83dbSDimitry Andric using namespace llvm;
475ffd83dbSDimitry Andric using namespace lld;
485ffd83dbSDimitry Andric using namespace lld::macho;
495ffd83dbSDimitry Andric 
505ffd83dbSDimitry Andric namespace {
515ffd83dbSDimitry Andric 
525ffd83dbSDimitry Andric struct Edge {
535ffd83dbSDimitry Andric   Edge(StringRef s, TrieNode *node) : substring(s), child(node) {}
545ffd83dbSDimitry Andric 
555ffd83dbSDimitry Andric   StringRef substring;
565ffd83dbSDimitry Andric   struct TrieNode *child;
575ffd83dbSDimitry Andric };
585ffd83dbSDimitry Andric 
595ffd83dbSDimitry Andric struct ExportInfo {
605ffd83dbSDimitry Andric   uint64_t address;
61e8d8bef9SDimitry Andric   uint8_t flags = 0;
62e8d8bef9SDimitry Andric   ExportInfo(const Symbol &sym, uint64_t imageBase)
63e8d8bef9SDimitry Andric       : address(sym.getVA() - imageBase) {
64fe6060f1SDimitry Andric     using namespace llvm::MachO;
65e8d8bef9SDimitry Andric     // Set the symbol type.
66e8d8bef9SDimitry Andric     if (sym.isWeakDef())
67e8d8bef9SDimitry Andric       flags |= EXPORT_SYMBOL_FLAGS_WEAK_DEFINITION;
685ffd83dbSDimitry Andric     // TODO: Add proper support for re-exports & stub-and-resolver flags.
69e8d8bef9SDimitry Andric 
70e8d8bef9SDimitry Andric     // Set the symbol kind.
71e8d8bef9SDimitry Andric     if (sym.isTlv()) {
72e8d8bef9SDimitry Andric       flags |= EXPORT_SYMBOL_FLAGS_KIND_THREAD_LOCAL;
73e8d8bef9SDimitry Andric     } else if (auto *defined = dyn_cast<Defined>(&sym)) {
74e8d8bef9SDimitry Andric       if (defined->isAbsolute())
75e8d8bef9SDimitry Andric         flags |= EXPORT_SYMBOL_FLAGS_KIND_ABSOLUTE;
76e8d8bef9SDimitry Andric     }
77e8d8bef9SDimitry Andric   }
785ffd83dbSDimitry Andric };
795ffd83dbSDimitry Andric 
805ffd83dbSDimitry Andric } // namespace
815ffd83dbSDimitry Andric 
825ffd83dbSDimitry Andric struct macho::TrieNode {
835ffd83dbSDimitry Andric   std::vector<Edge> edges;
845ffd83dbSDimitry Andric   Optional<ExportInfo> info;
855ffd83dbSDimitry Andric   // Estimated offset from the start of the serialized trie to the current node.
865ffd83dbSDimitry Andric   // This will converge to the true offset when updateOffset() is run to a
875ffd83dbSDimitry Andric   // fixpoint.
885ffd83dbSDimitry Andric   size_t offset = 0;
895ffd83dbSDimitry Andric 
905ffd83dbSDimitry Andric   // Returns whether the new estimated offset differs from the old one.
915ffd83dbSDimitry Andric   bool updateOffset(size_t &nextOffset);
925ffd83dbSDimitry Andric   void writeTo(uint8_t *buf) const;
935ffd83dbSDimitry Andric };
945ffd83dbSDimitry Andric 
955ffd83dbSDimitry Andric bool TrieNode::updateOffset(size_t &nextOffset) {
965ffd83dbSDimitry Andric   // Size of the whole node (including the terminalSize and the outgoing edges.)
975ffd83dbSDimitry Andric   // In contrast, terminalSize only records the size of the other data in the
985ffd83dbSDimitry Andric   // node.
995ffd83dbSDimitry Andric   size_t nodeSize;
1005ffd83dbSDimitry Andric   if (info) {
1015ffd83dbSDimitry Andric     uint32_t terminalSize =
102e8d8bef9SDimitry Andric         getULEB128Size(info->flags) + getULEB128Size(info->address);
1035ffd83dbSDimitry Andric     // Overall node size so far is the uleb128 size of the length of the symbol
1045ffd83dbSDimitry Andric     // info + the symbol info itself.
1055ffd83dbSDimitry Andric     nodeSize = terminalSize + getULEB128Size(terminalSize);
1065ffd83dbSDimitry Andric   } else {
1075ffd83dbSDimitry Andric     nodeSize = 1; // Size of terminalSize (which has a value of 0)
1085ffd83dbSDimitry Andric   }
1095ffd83dbSDimitry Andric   // Compute size of all child edges.
1105ffd83dbSDimitry Andric   ++nodeSize; // Byte for number of children.
111fe6060f1SDimitry Andric   for (const Edge &edge : edges) {
1125ffd83dbSDimitry Andric     nodeSize += edge.substring.size() + 1             // String length.
1135ffd83dbSDimitry Andric                 + getULEB128Size(edge.child->offset); // Offset len.
1145ffd83dbSDimitry Andric   }
1155ffd83dbSDimitry Andric   // On input, 'nextOffset' is the new preferred location for this node.
1165ffd83dbSDimitry Andric   bool result = (offset != nextOffset);
1175ffd83dbSDimitry Andric   // Store new location in node object for use by parents.
1185ffd83dbSDimitry Andric   offset = nextOffset;
1195ffd83dbSDimitry Andric   nextOffset += nodeSize;
1205ffd83dbSDimitry Andric   return result;
1215ffd83dbSDimitry Andric }
1225ffd83dbSDimitry Andric 
1235ffd83dbSDimitry Andric void TrieNode::writeTo(uint8_t *buf) const {
1245ffd83dbSDimitry Andric   buf += offset;
1255ffd83dbSDimitry Andric   if (info) {
1265ffd83dbSDimitry Andric     // TrieNodes with Symbol info: size, flags address
1275ffd83dbSDimitry Andric     uint32_t terminalSize =
128e8d8bef9SDimitry Andric         getULEB128Size(info->flags) + getULEB128Size(info->address);
1295ffd83dbSDimitry Andric     buf += encodeULEB128(terminalSize, buf);
130e8d8bef9SDimitry Andric     buf += encodeULEB128(info->flags, buf);
1315ffd83dbSDimitry Andric     buf += encodeULEB128(info->address, buf);
1325ffd83dbSDimitry Andric   } else {
1335ffd83dbSDimitry Andric     // TrieNode with no Symbol info.
1345ffd83dbSDimitry Andric     *buf++ = 0; // terminalSize
1355ffd83dbSDimitry Andric   }
1365ffd83dbSDimitry Andric   // Add number of children. TODO: Handle case where we have more than 256.
1375ffd83dbSDimitry Andric   assert(edges.size() < 256);
1385ffd83dbSDimitry Andric   *buf++ = edges.size();
1395ffd83dbSDimitry Andric   // Append each child edge substring and node offset.
1405ffd83dbSDimitry Andric   for (const Edge &edge : edges) {
1415ffd83dbSDimitry Andric     memcpy(buf, edge.substring.data(), edge.substring.size());
1425ffd83dbSDimitry Andric     buf += edge.substring.size();
1435ffd83dbSDimitry Andric     *buf++ = '\0';
1445ffd83dbSDimitry Andric     buf += encodeULEB128(edge.child->offset, buf);
1455ffd83dbSDimitry Andric   }
1465ffd83dbSDimitry Andric }
1475ffd83dbSDimitry Andric 
148*81ad6265SDimitry Andric TrieBuilder::~TrieBuilder() {
149*81ad6265SDimitry Andric   for (TrieNode *node : nodes)
150*81ad6265SDimitry Andric     delete node;
151*81ad6265SDimitry Andric }
152*81ad6265SDimitry Andric 
1535ffd83dbSDimitry Andric TrieNode *TrieBuilder::makeNode() {
154*81ad6265SDimitry Andric   auto *node = new TrieNode();
1555ffd83dbSDimitry Andric   nodes.emplace_back(node);
1565ffd83dbSDimitry Andric   return node;
1575ffd83dbSDimitry Andric }
1585ffd83dbSDimitry Andric 
1595ffd83dbSDimitry Andric static int charAt(const Symbol *sym, size_t pos) {
1605ffd83dbSDimitry Andric   StringRef str = sym->getName();
1615ffd83dbSDimitry Andric   if (pos >= str.size())
1625ffd83dbSDimitry Andric     return -1;
1635ffd83dbSDimitry Andric   return str[pos];
1645ffd83dbSDimitry Andric }
1655ffd83dbSDimitry Andric 
1665ffd83dbSDimitry Andric // Build the trie by performing a three-way radix quicksort: We start by sorting
1675ffd83dbSDimitry Andric // the strings by their first characters, then sort the strings with the same
1685ffd83dbSDimitry Andric // first characters by their second characters, and so on recursively. Each
1695ffd83dbSDimitry Andric // time the prefixes diverge, we add a node to the trie.
1705ffd83dbSDimitry Andric //
1715ffd83dbSDimitry Andric // node:    The most recently created node along this path in the trie (i.e.
1725ffd83dbSDimitry Andric //          the furthest from the root.)
1735ffd83dbSDimitry Andric // lastPos: The prefix length of the most recently created node, i.e. the number
1745ffd83dbSDimitry Andric //          of characters along its path from the root.
1755ffd83dbSDimitry Andric // pos:     The string index we are currently sorting on. Note that each symbol
1765ffd83dbSDimitry Andric //          S contained in vec has the same prefix S[0...pos).
1775ffd83dbSDimitry Andric void TrieBuilder::sortAndBuild(MutableArrayRef<const Symbol *> vec,
1785ffd83dbSDimitry Andric                                TrieNode *node, size_t lastPos, size_t pos) {
1795ffd83dbSDimitry Andric tailcall:
1805ffd83dbSDimitry Andric   if (vec.empty())
1815ffd83dbSDimitry Andric     return;
1825ffd83dbSDimitry Andric 
1835ffd83dbSDimitry Andric   // Partition items so that items in [0, i) are less than the pivot,
1845ffd83dbSDimitry Andric   // [i, j) are the same as the pivot, and [j, vec.size()) are greater than
1855ffd83dbSDimitry Andric   // the pivot.
1865ffd83dbSDimitry Andric   const Symbol *pivotSymbol = vec[vec.size() / 2];
1875ffd83dbSDimitry Andric   int pivot = charAt(pivotSymbol, pos);
1885ffd83dbSDimitry Andric   size_t i = 0;
1895ffd83dbSDimitry Andric   size_t j = vec.size();
1905ffd83dbSDimitry Andric   for (size_t k = 0; k < j;) {
1915ffd83dbSDimitry Andric     int c = charAt(vec[k], pos);
1925ffd83dbSDimitry Andric     if (c < pivot)
1935ffd83dbSDimitry Andric       std::swap(vec[i++], vec[k++]);
1945ffd83dbSDimitry Andric     else if (c > pivot)
1955ffd83dbSDimitry Andric       std::swap(vec[--j], vec[k]);
1965ffd83dbSDimitry Andric     else
1975ffd83dbSDimitry Andric       k++;
1985ffd83dbSDimitry Andric   }
1995ffd83dbSDimitry Andric 
2005ffd83dbSDimitry Andric   bool isTerminal = pivot == -1;
2015ffd83dbSDimitry Andric   bool prefixesDiverge = i != 0 || j != vec.size();
2025ffd83dbSDimitry Andric   if (lastPos != pos && (isTerminal || prefixesDiverge)) {
2035ffd83dbSDimitry Andric     TrieNode *newNode = makeNode();
2045ffd83dbSDimitry Andric     node->edges.emplace_back(pivotSymbol->getName().slice(lastPos, pos),
2055ffd83dbSDimitry Andric                              newNode);
2065ffd83dbSDimitry Andric     node = newNode;
2075ffd83dbSDimitry Andric     lastPos = pos;
2085ffd83dbSDimitry Andric   }
2095ffd83dbSDimitry Andric 
2105ffd83dbSDimitry Andric   sortAndBuild(vec.slice(0, i), node, lastPos, pos);
2115ffd83dbSDimitry Andric   sortAndBuild(vec.slice(j), node, lastPos, pos);
2125ffd83dbSDimitry Andric 
2135ffd83dbSDimitry Andric   if (isTerminal) {
2145ffd83dbSDimitry Andric     assert(j - i == 1); // no duplicate symbols
215e8d8bef9SDimitry Andric     node->info = ExportInfo(*pivotSymbol, imageBase);
2165ffd83dbSDimitry Andric   } else {
2175ffd83dbSDimitry Andric     // This is the tail-call-optimized version of the following:
2185ffd83dbSDimitry Andric     // sortAndBuild(vec.slice(i, j - i), node, lastPos, pos + 1);
2195ffd83dbSDimitry Andric     vec = vec.slice(i, j - i);
2205ffd83dbSDimitry Andric     ++pos;
2215ffd83dbSDimitry Andric     goto tailcall;
2225ffd83dbSDimitry Andric   }
2235ffd83dbSDimitry Andric }
2245ffd83dbSDimitry Andric 
2255ffd83dbSDimitry Andric size_t TrieBuilder::build() {
2265ffd83dbSDimitry Andric   if (exported.empty())
2275ffd83dbSDimitry Andric     return 0;
2285ffd83dbSDimitry Andric 
2295ffd83dbSDimitry Andric   TrieNode *root = makeNode();
2305ffd83dbSDimitry Andric   sortAndBuild(exported, root, 0, 0);
2315ffd83dbSDimitry Andric 
2325ffd83dbSDimitry Andric   // Assign each node in the vector an offset in the trie stream, iterating
2335ffd83dbSDimitry Andric   // until all uleb128 sizes have stabilized.
2345ffd83dbSDimitry Andric   size_t offset;
2355ffd83dbSDimitry Andric   bool more;
2365ffd83dbSDimitry Andric   do {
2375ffd83dbSDimitry Andric     offset = 0;
2385ffd83dbSDimitry Andric     more = false;
2395ffd83dbSDimitry Andric     for (TrieNode *node : nodes)
2405ffd83dbSDimitry Andric       more |= node->updateOffset(offset);
2415ffd83dbSDimitry Andric   } while (more);
2425ffd83dbSDimitry Andric 
2435ffd83dbSDimitry Andric   return offset;
2445ffd83dbSDimitry Andric }
2455ffd83dbSDimitry Andric 
2465ffd83dbSDimitry Andric void TrieBuilder::writeTo(uint8_t *buf) const {
2475ffd83dbSDimitry Andric   for (TrieNode *node : nodes)
2485ffd83dbSDimitry Andric     node->writeTo(buf);
2495ffd83dbSDimitry Andric }
2505ffd83dbSDimitry Andric 
2515ffd83dbSDimitry Andric namespace {
2525ffd83dbSDimitry Andric 
2535ffd83dbSDimitry Andric // Parse a serialized trie and invoke a callback for each entry.
2545ffd83dbSDimitry Andric class TrieParser {
2555ffd83dbSDimitry Andric public:
2565ffd83dbSDimitry Andric   TrieParser(const uint8_t *buf, size_t size, const TrieEntryCallback &callback)
2575ffd83dbSDimitry Andric       : start(buf), end(start + size), callback(callback) {}
2585ffd83dbSDimitry Andric 
2595ffd83dbSDimitry Andric   void parse(const uint8_t *buf, const Twine &cumulativeString);
2605ffd83dbSDimitry Andric 
2615ffd83dbSDimitry Andric   void parse() { parse(start, ""); }
2625ffd83dbSDimitry Andric 
2635ffd83dbSDimitry Andric   const uint8_t *start;
2645ffd83dbSDimitry Andric   const uint8_t *end;
2655ffd83dbSDimitry Andric   const TrieEntryCallback &callback;
2665ffd83dbSDimitry Andric };
2675ffd83dbSDimitry Andric 
2685ffd83dbSDimitry Andric } // namespace
2695ffd83dbSDimitry Andric 
2705ffd83dbSDimitry Andric void TrieParser::parse(const uint8_t *buf, const Twine &cumulativeString) {
2715ffd83dbSDimitry Andric   if (buf >= end)
2725ffd83dbSDimitry Andric     fatal("Node offset points outside export section");
2735ffd83dbSDimitry Andric 
2745ffd83dbSDimitry Andric   unsigned ulebSize;
2755ffd83dbSDimitry Andric   uint64_t terminalSize = decodeULEB128(buf, &ulebSize);
2765ffd83dbSDimitry Andric   buf += ulebSize;
2775ffd83dbSDimitry Andric   uint64_t flags = 0;
2785ffd83dbSDimitry Andric   size_t offset;
2795ffd83dbSDimitry Andric   if (terminalSize != 0) {
2805ffd83dbSDimitry Andric     flags = decodeULEB128(buf, &ulebSize);
2815ffd83dbSDimitry Andric     callback(cumulativeString, flags);
2825ffd83dbSDimitry Andric   }
2835ffd83dbSDimitry Andric   buf += terminalSize;
2845ffd83dbSDimitry Andric   uint8_t numEdges = *buf++;
2855ffd83dbSDimitry Andric   for (uint8_t i = 0; i < numEdges; ++i) {
2865ffd83dbSDimitry Andric     const char *cbuf = reinterpret_cast<const char *>(buf);
2875ffd83dbSDimitry Andric     StringRef substring = StringRef(cbuf, strnlen(cbuf, end - buf));
2885ffd83dbSDimitry Andric     buf += substring.size() + 1;
2895ffd83dbSDimitry Andric     offset = decodeULEB128(buf, &ulebSize);
2905ffd83dbSDimitry Andric     buf += ulebSize;
2915ffd83dbSDimitry Andric     parse(start + offset, cumulativeString + substring);
2925ffd83dbSDimitry Andric   }
2935ffd83dbSDimitry Andric }
2945ffd83dbSDimitry Andric 
2955ffd83dbSDimitry Andric void macho::parseTrie(const uint8_t *buf, size_t size,
2965ffd83dbSDimitry Andric                       const TrieEntryCallback &callback) {
2975ffd83dbSDimitry Andric   if (size == 0)
2985ffd83dbSDimitry Andric     return;
2995ffd83dbSDimitry Andric 
3005ffd83dbSDimitry Andric   TrieParser(buf, size, callback).parse();
3015ffd83dbSDimitry Andric }
302