xref: /freebsd/contrib/llvm-project/lld/MachO/ExportTrie.cpp (revision e8d8bef961a50d4dc22501cde4fb9fb0be1b2532)
15ffd83dbSDimitry Andric //===- ExportTrie.cpp -----------------------------------------------------===//
25ffd83dbSDimitry Andric //
35ffd83dbSDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
45ffd83dbSDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
55ffd83dbSDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
65ffd83dbSDimitry Andric //
75ffd83dbSDimitry Andric //===----------------------------------------------------------------------===//
85ffd83dbSDimitry Andric //
95ffd83dbSDimitry Andric // This is a partial implementation of the Mach-O export trie format. It's
105ffd83dbSDimitry Andric // essentially a symbol table encoded as a compressed prefix trie, meaning that
115ffd83dbSDimitry Andric // the common prefixes of each symbol name are shared for a more compact
125ffd83dbSDimitry Andric // representation. The prefixes are stored on the edges of the trie, and one
135ffd83dbSDimitry Andric // edge can represent multiple characters. For example, given two exported
145ffd83dbSDimitry Andric // symbols _bar and _baz, we will have a trie like this (terminal nodes are
155ffd83dbSDimitry Andric // marked with an asterisk):
165ffd83dbSDimitry Andric //
175ffd83dbSDimitry Andric //              +-+-+
185ffd83dbSDimitry Andric //              |   | // root node
195ffd83dbSDimitry Andric //              +-+-+
205ffd83dbSDimitry Andric //                |
215ffd83dbSDimitry Andric //                | _ba
225ffd83dbSDimitry Andric //                |
235ffd83dbSDimitry Andric //              +-+-+
245ffd83dbSDimitry Andric //              |   |
255ffd83dbSDimitry Andric //              +-+-+
265ffd83dbSDimitry Andric //           r /     \ z
275ffd83dbSDimitry Andric //            /       \
285ffd83dbSDimitry Andric //        +-+-+       +-+-+
295ffd83dbSDimitry Andric //        | * |       | * |
305ffd83dbSDimitry Andric //        +-+-+       +-+-+
315ffd83dbSDimitry Andric //
325ffd83dbSDimitry Andric // More documentation of the format can be found in
335ffd83dbSDimitry Andric // llvm/tools/obj2yaml/macho2yaml.cpp.
345ffd83dbSDimitry Andric //
355ffd83dbSDimitry Andric //===----------------------------------------------------------------------===//
365ffd83dbSDimitry Andric 
375ffd83dbSDimitry Andric #include "ExportTrie.h"
385ffd83dbSDimitry Andric #include "Symbols.h"
395ffd83dbSDimitry Andric 
405ffd83dbSDimitry Andric #include "lld/Common/ErrorHandler.h"
415ffd83dbSDimitry Andric #include "lld/Common/Memory.h"
425ffd83dbSDimitry Andric #include "llvm/ADT/Optional.h"
435ffd83dbSDimitry Andric #include "llvm/BinaryFormat/MachO.h"
445ffd83dbSDimitry Andric #include "llvm/Support/LEB128.h"
455ffd83dbSDimitry Andric 
465ffd83dbSDimitry Andric using namespace llvm;
475ffd83dbSDimitry Andric using namespace llvm::MachO;
485ffd83dbSDimitry Andric using namespace lld;
495ffd83dbSDimitry Andric using namespace lld::macho;
505ffd83dbSDimitry Andric 
515ffd83dbSDimitry Andric namespace {
525ffd83dbSDimitry Andric 
535ffd83dbSDimitry Andric struct Edge {
545ffd83dbSDimitry Andric   Edge(StringRef s, TrieNode *node) : substring(s), child(node) {}
555ffd83dbSDimitry Andric 
565ffd83dbSDimitry Andric   StringRef substring;
575ffd83dbSDimitry Andric   struct TrieNode *child;
585ffd83dbSDimitry Andric };
595ffd83dbSDimitry Andric 
605ffd83dbSDimitry Andric struct ExportInfo {
615ffd83dbSDimitry Andric   uint64_t address;
62*e8d8bef9SDimitry Andric   uint8_t flags = 0;
63*e8d8bef9SDimitry Andric   ExportInfo(const Symbol &sym, uint64_t imageBase)
64*e8d8bef9SDimitry Andric       : address(sym.getVA() - imageBase) {
65*e8d8bef9SDimitry Andric     // Set the symbol type.
66*e8d8bef9SDimitry Andric     if (sym.isWeakDef())
67*e8d8bef9SDimitry Andric       flags |= EXPORT_SYMBOL_FLAGS_WEAK_DEFINITION;
685ffd83dbSDimitry Andric     // TODO: Add proper support for re-exports & stub-and-resolver flags.
69*e8d8bef9SDimitry Andric 
70*e8d8bef9SDimitry Andric     // Set the symbol kind.
71*e8d8bef9SDimitry Andric     if (sym.isTlv()) {
72*e8d8bef9SDimitry Andric       flags |= EXPORT_SYMBOL_FLAGS_KIND_THREAD_LOCAL;
73*e8d8bef9SDimitry Andric     } else if (auto *defined = dyn_cast<Defined>(&sym)) {
74*e8d8bef9SDimitry Andric       if (defined->isAbsolute())
75*e8d8bef9SDimitry Andric         flags |= EXPORT_SYMBOL_FLAGS_KIND_ABSOLUTE;
76*e8d8bef9SDimitry Andric     }
77*e8d8bef9SDimitry Andric   }
785ffd83dbSDimitry Andric };
795ffd83dbSDimitry Andric 
805ffd83dbSDimitry Andric } // namespace
815ffd83dbSDimitry Andric 
825ffd83dbSDimitry Andric struct macho::TrieNode {
835ffd83dbSDimitry Andric   std::vector<Edge> edges;
845ffd83dbSDimitry Andric   Optional<ExportInfo> info;
855ffd83dbSDimitry Andric   // Estimated offset from the start of the serialized trie to the current node.
865ffd83dbSDimitry Andric   // This will converge to the true offset when updateOffset() is run to a
875ffd83dbSDimitry Andric   // fixpoint.
885ffd83dbSDimitry Andric   size_t offset = 0;
895ffd83dbSDimitry Andric 
905ffd83dbSDimitry Andric   // Returns whether the new estimated offset differs from the old one.
915ffd83dbSDimitry Andric   bool updateOffset(size_t &nextOffset);
925ffd83dbSDimitry Andric   void writeTo(uint8_t *buf) const;
935ffd83dbSDimitry Andric };
945ffd83dbSDimitry Andric 
955ffd83dbSDimitry Andric bool TrieNode::updateOffset(size_t &nextOffset) {
965ffd83dbSDimitry Andric   // Size of the whole node (including the terminalSize and the outgoing edges.)
975ffd83dbSDimitry Andric   // In contrast, terminalSize only records the size of the other data in the
985ffd83dbSDimitry Andric   // node.
995ffd83dbSDimitry Andric   size_t nodeSize;
1005ffd83dbSDimitry Andric   if (info) {
1015ffd83dbSDimitry Andric     uint32_t terminalSize =
102*e8d8bef9SDimitry Andric         getULEB128Size(info->flags) + getULEB128Size(info->address);
1035ffd83dbSDimitry Andric     // Overall node size so far is the uleb128 size of the length of the symbol
1045ffd83dbSDimitry Andric     // info + the symbol info itself.
1055ffd83dbSDimitry Andric     nodeSize = terminalSize + getULEB128Size(terminalSize);
1065ffd83dbSDimitry Andric   } else {
1075ffd83dbSDimitry Andric     nodeSize = 1; // Size of terminalSize (which has a value of 0)
1085ffd83dbSDimitry Andric   }
1095ffd83dbSDimitry Andric   // Compute size of all child edges.
1105ffd83dbSDimitry Andric   ++nodeSize; // Byte for number of children.
1115ffd83dbSDimitry Andric   for (Edge &edge : edges) {
1125ffd83dbSDimitry Andric     nodeSize += edge.substring.size() + 1             // String length.
1135ffd83dbSDimitry Andric                 + getULEB128Size(edge.child->offset); // Offset len.
1145ffd83dbSDimitry Andric   }
1155ffd83dbSDimitry Andric   // On input, 'nextOffset' is the new preferred location for this node.
1165ffd83dbSDimitry Andric   bool result = (offset != nextOffset);
1175ffd83dbSDimitry Andric   // Store new location in node object for use by parents.
1185ffd83dbSDimitry Andric   offset = nextOffset;
1195ffd83dbSDimitry Andric   nextOffset += nodeSize;
1205ffd83dbSDimitry Andric   return result;
1215ffd83dbSDimitry Andric }
1225ffd83dbSDimitry Andric 
1235ffd83dbSDimitry Andric void TrieNode::writeTo(uint8_t *buf) const {
1245ffd83dbSDimitry Andric   buf += offset;
1255ffd83dbSDimitry Andric   if (info) {
1265ffd83dbSDimitry Andric     // TrieNodes with Symbol info: size, flags address
1275ffd83dbSDimitry Andric     uint32_t terminalSize =
128*e8d8bef9SDimitry Andric         getULEB128Size(info->flags) + getULEB128Size(info->address);
1295ffd83dbSDimitry Andric     buf += encodeULEB128(terminalSize, buf);
130*e8d8bef9SDimitry Andric     buf += encodeULEB128(info->flags, buf);
1315ffd83dbSDimitry Andric     buf += encodeULEB128(info->address, buf);
1325ffd83dbSDimitry Andric   } else {
1335ffd83dbSDimitry Andric     // TrieNode with no Symbol info.
1345ffd83dbSDimitry Andric     *buf++ = 0; // terminalSize
1355ffd83dbSDimitry Andric   }
1365ffd83dbSDimitry Andric   // Add number of children. TODO: Handle case where we have more than 256.
1375ffd83dbSDimitry Andric   assert(edges.size() < 256);
1385ffd83dbSDimitry Andric   *buf++ = edges.size();
1395ffd83dbSDimitry Andric   // Append each child edge substring and node offset.
1405ffd83dbSDimitry Andric   for (const Edge &edge : edges) {
1415ffd83dbSDimitry Andric     memcpy(buf, edge.substring.data(), edge.substring.size());
1425ffd83dbSDimitry Andric     buf += edge.substring.size();
1435ffd83dbSDimitry Andric     *buf++ = '\0';
1445ffd83dbSDimitry Andric     buf += encodeULEB128(edge.child->offset, buf);
1455ffd83dbSDimitry Andric   }
1465ffd83dbSDimitry Andric }
1475ffd83dbSDimitry Andric 
1485ffd83dbSDimitry Andric TrieNode *TrieBuilder::makeNode() {
1495ffd83dbSDimitry Andric   auto *node = make<TrieNode>();
1505ffd83dbSDimitry Andric   nodes.emplace_back(node);
1515ffd83dbSDimitry Andric   return node;
1525ffd83dbSDimitry Andric }
1535ffd83dbSDimitry Andric 
1545ffd83dbSDimitry Andric static int charAt(const Symbol *sym, size_t pos) {
1555ffd83dbSDimitry Andric   StringRef str = sym->getName();
1565ffd83dbSDimitry Andric   if (pos >= str.size())
1575ffd83dbSDimitry Andric     return -1;
1585ffd83dbSDimitry Andric   return str[pos];
1595ffd83dbSDimitry Andric }
1605ffd83dbSDimitry Andric 
1615ffd83dbSDimitry Andric // Build the trie by performing a three-way radix quicksort: We start by sorting
1625ffd83dbSDimitry Andric // the strings by their first characters, then sort the strings with the same
1635ffd83dbSDimitry Andric // first characters by their second characters, and so on recursively. Each
1645ffd83dbSDimitry Andric // time the prefixes diverge, we add a node to the trie.
1655ffd83dbSDimitry Andric //
1665ffd83dbSDimitry Andric // node:    The most recently created node along this path in the trie (i.e.
1675ffd83dbSDimitry Andric //          the furthest from the root.)
1685ffd83dbSDimitry Andric // lastPos: The prefix length of the most recently created node, i.e. the number
1695ffd83dbSDimitry Andric //          of characters along its path from the root.
1705ffd83dbSDimitry Andric // pos:     The string index we are currently sorting on. Note that each symbol
1715ffd83dbSDimitry Andric //          S contained in vec has the same prefix S[0...pos).
1725ffd83dbSDimitry Andric void TrieBuilder::sortAndBuild(MutableArrayRef<const Symbol *> vec,
1735ffd83dbSDimitry Andric                                TrieNode *node, size_t lastPos, size_t pos) {
1745ffd83dbSDimitry Andric tailcall:
1755ffd83dbSDimitry Andric   if (vec.empty())
1765ffd83dbSDimitry Andric     return;
1775ffd83dbSDimitry Andric 
1785ffd83dbSDimitry Andric   // Partition items so that items in [0, i) are less than the pivot,
1795ffd83dbSDimitry Andric   // [i, j) are the same as the pivot, and [j, vec.size()) are greater than
1805ffd83dbSDimitry Andric   // the pivot.
1815ffd83dbSDimitry Andric   const Symbol *pivotSymbol = vec[vec.size() / 2];
1825ffd83dbSDimitry Andric   int pivot = charAt(pivotSymbol, pos);
1835ffd83dbSDimitry Andric   size_t i = 0;
1845ffd83dbSDimitry Andric   size_t j = vec.size();
1855ffd83dbSDimitry Andric   for (size_t k = 0; k < j;) {
1865ffd83dbSDimitry Andric     int c = charAt(vec[k], pos);
1875ffd83dbSDimitry Andric     if (c < pivot)
1885ffd83dbSDimitry Andric       std::swap(vec[i++], vec[k++]);
1895ffd83dbSDimitry Andric     else if (c > pivot)
1905ffd83dbSDimitry Andric       std::swap(vec[--j], vec[k]);
1915ffd83dbSDimitry Andric     else
1925ffd83dbSDimitry Andric       k++;
1935ffd83dbSDimitry Andric   }
1945ffd83dbSDimitry Andric 
1955ffd83dbSDimitry Andric   bool isTerminal = pivot == -1;
1965ffd83dbSDimitry Andric   bool prefixesDiverge = i != 0 || j != vec.size();
1975ffd83dbSDimitry Andric   if (lastPos != pos && (isTerminal || prefixesDiverge)) {
1985ffd83dbSDimitry Andric     TrieNode *newNode = makeNode();
1995ffd83dbSDimitry Andric     node->edges.emplace_back(pivotSymbol->getName().slice(lastPos, pos),
2005ffd83dbSDimitry Andric                              newNode);
2015ffd83dbSDimitry Andric     node = newNode;
2025ffd83dbSDimitry Andric     lastPos = pos;
2035ffd83dbSDimitry Andric   }
2045ffd83dbSDimitry Andric 
2055ffd83dbSDimitry Andric   sortAndBuild(vec.slice(0, i), node, lastPos, pos);
2065ffd83dbSDimitry Andric   sortAndBuild(vec.slice(j), node, lastPos, pos);
2075ffd83dbSDimitry Andric 
2085ffd83dbSDimitry Andric   if (isTerminal) {
2095ffd83dbSDimitry Andric     assert(j - i == 1); // no duplicate symbols
210*e8d8bef9SDimitry Andric     node->info = ExportInfo(*pivotSymbol, imageBase);
2115ffd83dbSDimitry Andric   } else {
2125ffd83dbSDimitry Andric     // This is the tail-call-optimized version of the following:
2135ffd83dbSDimitry Andric     // sortAndBuild(vec.slice(i, j - i), node, lastPos, pos + 1);
2145ffd83dbSDimitry Andric     vec = vec.slice(i, j - i);
2155ffd83dbSDimitry Andric     ++pos;
2165ffd83dbSDimitry Andric     goto tailcall;
2175ffd83dbSDimitry Andric   }
2185ffd83dbSDimitry Andric }
2195ffd83dbSDimitry Andric 
2205ffd83dbSDimitry Andric size_t TrieBuilder::build() {
2215ffd83dbSDimitry Andric   if (exported.empty())
2225ffd83dbSDimitry Andric     return 0;
2235ffd83dbSDimitry Andric 
2245ffd83dbSDimitry Andric   TrieNode *root = makeNode();
2255ffd83dbSDimitry Andric   sortAndBuild(exported, root, 0, 0);
2265ffd83dbSDimitry Andric 
2275ffd83dbSDimitry Andric   // Assign each node in the vector an offset in the trie stream, iterating
2285ffd83dbSDimitry Andric   // until all uleb128 sizes have stabilized.
2295ffd83dbSDimitry Andric   size_t offset;
2305ffd83dbSDimitry Andric   bool more;
2315ffd83dbSDimitry Andric   do {
2325ffd83dbSDimitry Andric     offset = 0;
2335ffd83dbSDimitry Andric     more = false;
2345ffd83dbSDimitry Andric     for (TrieNode *node : nodes)
2355ffd83dbSDimitry Andric       more |= node->updateOffset(offset);
2365ffd83dbSDimitry Andric   } while (more);
2375ffd83dbSDimitry Andric 
2385ffd83dbSDimitry Andric   return offset;
2395ffd83dbSDimitry Andric }
2405ffd83dbSDimitry Andric 
2415ffd83dbSDimitry Andric void TrieBuilder::writeTo(uint8_t *buf) const {
2425ffd83dbSDimitry Andric   for (TrieNode *node : nodes)
2435ffd83dbSDimitry Andric     node->writeTo(buf);
2445ffd83dbSDimitry Andric }
2455ffd83dbSDimitry Andric 
2465ffd83dbSDimitry Andric namespace {
2475ffd83dbSDimitry Andric 
2485ffd83dbSDimitry Andric // Parse a serialized trie and invoke a callback for each entry.
2495ffd83dbSDimitry Andric class TrieParser {
2505ffd83dbSDimitry Andric public:
2515ffd83dbSDimitry Andric   TrieParser(const uint8_t *buf, size_t size, const TrieEntryCallback &callback)
2525ffd83dbSDimitry Andric       : start(buf), end(start + size), callback(callback) {}
2535ffd83dbSDimitry Andric 
2545ffd83dbSDimitry Andric   void parse(const uint8_t *buf, const Twine &cumulativeString);
2555ffd83dbSDimitry Andric 
2565ffd83dbSDimitry Andric   void parse() { parse(start, ""); }
2575ffd83dbSDimitry Andric 
2585ffd83dbSDimitry Andric   const uint8_t *start;
2595ffd83dbSDimitry Andric   const uint8_t *end;
2605ffd83dbSDimitry Andric   const TrieEntryCallback &callback;
2615ffd83dbSDimitry Andric };
2625ffd83dbSDimitry Andric 
2635ffd83dbSDimitry Andric } // namespace
2645ffd83dbSDimitry Andric 
2655ffd83dbSDimitry Andric void TrieParser::parse(const uint8_t *buf, const Twine &cumulativeString) {
2665ffd83dbSDimitry Andric   if (buf >= end)
2675ffd83dbSDimitry Andric     fatal("Node offset points outside export section");
2685ffd83dbSDimitry Andric 
2695ffd83dbSDimitry Andric   unsigned ulebSize;
2705ffd83dbSDimitry Andric   uint64_t terminalSize = decodeULEB128(buf, &ulebSize);
2715ffd83dbSDimitry Andric   buf += ulebSize;
2725ffd83dbSDimitry Andric   uint64_t flags = 0;
2735ffd83dbSDimitry Andric   size_t offset;
2745ffd83dbSDimitry Andric   if (terminalSize != 0) {
2755ffd83dbSDimitry Andric     flags = decodeULEB128(buf, &ulebSize);
2765ffd83dbSDimitry Andric     callback(cumulativeString, flags);
2775ffd83dbSDimitry Andric   }
2785ffd83dbSDimitry Andric   buf += terminalSize;
2795ffd83dbSDimitry Andric   uint8_t numEdges = *buf++;
2805ffd83dbSDimitry Andric   for (uint8_t i = 0; i < numEdges; ++i) {
2815ffd83dbSDimitry Andric     const char *cbuf = reinterpret_cast<const char *>(buf);
2825ffd83dbSDimitry Andric     StringRef substring = StringRef(cbuf, strnlen(cbuf, end - buf));
2835ffd83dbSDimitry Andric     buf += substring.size() + 1;
2845ffd83dbSDimitry Andric     offset = decodeULEB128(buf, &ulebSize);
2855ffd83dbSDimitry Andric     buf += ulebSize;
2865ffd83dbSDimitry Andric     parse(start + offset, cumulativeString + substring);
2875ffd83dbSDimitry Andric   }
2885ffd83dbSDimitry Andric }
2895ffd83dbSDimitry Andric 
2905ffd83dbSDimitry Andric void macho::parseTrie(const uint8_t *buf, size_t size,
2915ffd83dbSDimitry Andric                       const TrieEntryCallback &callback) {
2925ffd83dbSDimitry Andric   if (size == 0)
2935ffd83dbSDimitry Andric     return;
2945ffd83dbSDimitry Andric 
2955ffd83dbSDimitry Andric   TrieParser(buf, size, callback).parse();
2965ffd83dbSDimitry Andric }
297