xref: /freebsd/contrib/llvm-project/lld/MachO/ExportTrie.cpp (revision 5ffd83dbcc34f10e07f6d3e968ae6365869615f4)
1*5ffd83dbSDimitry Andric //===- ExportTrie.cpp -----------------------------------------------------===//
2*5ffd83dbSDimitry Andric //
3*5ffd83dbSDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4*5ffd83dbSDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
5*5ffd83dbSDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6*5ffd83dbSDimitry Andric //
7*5ffd83dbSDimitry Andric //===----------------------------------------------------------------------===//
8*5ffd83dbSDimitry Andric //
9*5ffd83dbSDimitry Andric // This is a partial implementation of the Mach-O export trie format. It's
10*5ffd83dbSDimitry Andric // essentially a symbol table encoded as a compressed prefix trie, meaning that
11*5ffd83dbSDimitry Andric // the common prefixes of each symbol name are shared for a more compact
12*5ffd83dbSDimitry Andric // representation. The prefixes are stored on the edges of the trie, and one
13*5ffd83dbSDimitry Andric // edge can represent multiple characters. For example, given two exported
14*5ffd83dbSDimitry Andric // symbols _bar and _baz, we will have a trie like this (terminal nodes are
15*5ffd83dbSDimitry Andric // marked with an asterisk):
16*5ffd83dbSDimitry Andric //
17*5ffd83dbSDimitry Andric //              +-+-+
18*5ffd83dbSDimitry Andric //              |   | // root node
19*5ffd83dbSDimitry Andric //              +-+-+
20*5ffd83dbSDimitry Andric //                |
21*5ffd83dbSDimitry Andric //                | _ba
22*5ffd83dbSDimitry Andric //                |
23*5ffd83dbSDimitry Andric //              +-+-+
24*5ffd83dbSDimitry Andric //              |   |
25*5ffd83dbSDimitry Andric //              +-+-+
26*5ffd83dbSDimitry Andric //           r /     \ z
27*5ffd83dbSDimitry Andric //            /       \
28*5ffd83dbSDimitry Andric //        +-+-+       +-+-+
29*5ffd83dbSDimitry Andric //        | * |       | * |
30*5ffd83dbSDimitry Andric //        +-+-+       +-+-+
31*5ffd83dbSDimitry Andric //
32*5ffd83dbSDimitry Andric // More documentation of the format can be found in
33*5ffd83dbSDimitry Andric // llvm/tools/obj2yaml/macho2yaml.cpp.
34*5ffd83dbSDimitry Andric //
35*5ffd83dbSDimitry Andric //===----------------------------------------------------------------------===//
36*5ffd83dbSDimitry Andric 
37*5ffd83dbSDimitry Andric #include "ExportTrie.h"
38*5ffd83dbSDimitry Andric #include "Symbols.h"
39*5ffd83dbSDimitry Andric 
40*5ffd83dbSDimitry Andric #include "lld/Common/ErrorHandler.h"
41*5ffd83dbSDimitry Andric #include "lld/Common/Memory.h"
42*5ffd83dbSDimitry Andric #include "llvm/ADT/Optional.h"
43*5ffd83dbSDimitry Andric #include "llvm/BinaryFormat/MachO.h"
44*5ffd83dbSDimitry Andric #include "llvm/Support/LEB128.h"
45*5ffd83dbSDimitry Andric 
46*5ffd83dbSDimitry Andric using namespace llvm;
47*5ffd83dbSDimitry Andric using namespace llvm::MachO;
48*5ffd83dbSDimitry Andric using namespace lld;
49*5ffd83dbSDimitry Andric using namespace lld::macho;
50*5ffd83dbSDimitry Andric 
51*5ffd83dbSDimitry Andric namespace {
52*5ffd83dbSDimitry Andric 
53*5ffd83dbSDimitry Andric struct Edge {
54*5ffd83dbSDimitry Andric   Edge(StringRef s, TrieNode *node) : substring(s), child(node) {}
55*5ffd83dbSDimitry Andric 
56*5ffd83dbSDimitry Andric   StringRef substring;
57*5ffd83dbSDimitry Andric   struct TrieNode *child;
58*5ffd83dbSDimitry Andric };
59*5ffd83dbSDimitry Andric 
60*5ffd83dbSDimitry Andric struct ExportInfo {
61*5ffd83dbSDimitry Andric   uint64_t address;
62*5ffd83dbSDimitry Andric   // TODO: Add proper support for re-exports & stub-and-resolver flags.
63*5ffd83dbSDimitry Andric };
64*5ffd83dbSDimitry Andric 
65*5ffd83dbSDimitry Andric } // namespace
66*5ffd83dbSDimitry Andric 
67*5ffd83dbSDimitry Andric struct macho::TrieNode {
68*5ffd83dbSDimitry Andric   std::vector<Edge> edges;
69*5ffd83dbSDimitry Andric   Optional<ExportInfo> info;
70*5ffd83dbSDimitry Andric   // Estimated offset from the start of the serialized trie to the current node.
71*5ffd83dbSDimitry Andric   // This will converge to the true offset when updateOffset() is run to a
72*5ffd83dbSDimitry Andric   // fixpoint.
73*5ffd83dbSDimitry Andric   size_t offset = 0;
74*5ffd83dbSDimitry Andric 
75*5ffd83dbSDimitry Andric   // Returns whether the new estimated offset differs from the old one.
76*5ffd83dbSDimitry Andric   bool updateOffset(size_t &nextOffset);
77*5ffd83dbSDimitry Andric   void writeTo(uint8_t *buf) const;
78*5ffd83dbSDimitry Andric };
79*5ffd83dbSDimitry Andric 
80*5ffd83dbSDimitry Andric bool TrieNode::updateOffset(size_t &nextOffset) {
81*5ffd83dbSDimitry Andric   // Size of the whole node (including the terminalSize and the outgoing edges.)
82*5ffd83dbSDimitry Andric   // In contrast, terminalSize only records the size of the other data in the
83*5ffd83dbSDimitry Andric   // node.
84*5ffd83dbSDimitry Andric   size_t nodeSize;
85*5ffd83dbSDimitry Andric   if (info) {
86*5ffd83dbSDimitry Andric     uint64_t flags = 0;
87*5ffd83dbSDimitry Andric     uint32_t terminalSize =
88*5ffd83dbSDimitry Andric         getULEB128Size(flags) + getULEB128Size(info->address);
89*5ffd83dbSDimitry Andric     // Overall node size so far is the uleb128 size of the length of the symbol
90*5ffd83dbSDimitry Andric     // info + the symbol info itself.
91*5ffd83dbSDimitry Andric     nodeSize = terminalSize + getULEB128Size(terminalSize);
92*5ffd83dbSDimitry Andric   } else {
93*5ffd83dbSDimitry Andric     nodeSize = 1; // Size of terminalSize (which has a value of 0)
94*5ffd83dbSDimitry Andric   }
95*5ffd83dbSDimitry Andric   // Compute size of all child edges.
96*5ffd83dbSDimitry Andric   ++nodeSize; // Byte for number of children.
97*5ffd83dbSDimitry Andric   for (Edge &edge : edges) {
98*5ffd83dbSDimitry Andric     nodeSize += edge.substring.size() + 1             // String length.
99*5ffd83dbSDimitry Andric                 + getULEB128Size(edge.child->offset); // Offset len.
100*5ffd83dbSDimitry Andric   }
101*5ffd83dbSDimitry Andric   // On input, 'nextOffset' is the new preferred location for this node.
102*5ffd83dbSDimitry Andric   bool result = (offset != nextOffset);
103*5ffd83dbSDimitry Andric   // Store new location in node object for use by parents.
104*5ffd83dbSDimitry Andric   offset = nextOffset;
105*5ffd83dbSDimitry Andric   nextOffset += nodeSize;
106*5ffd83dbSDimitry Andric   return result;
107*5ffd83dbSDimitry Andric }
108*5ffd83dbSDimitry Andric 
109*5ffd83dbSDimitry Andric void TrieNode::writeTo(uint8_t *buf) const {
110*5ffd83dbSDimitry Andric   buf += offset;
111*5ffd83dbSDimitry Andric   if (info) {
112*5ffd83dbSDimitry Andric     // TrieNodes with Symbol info: size, flags address
113*5ffd83dbSDimitry Andric     uint64_t flags = 0; // TODO: emit proper flags
114*5ffd83dbSDimitry Andric     uint32_t terminalSize =
115*5ffd83dbSDimitry Andric         getULEB128Size(flags) + getULEB128Size(info->address);
116*5ffd83dbSDimitry Andric     buf += encodeULEB128(terminalSize, buf);
117*5ffd83dbSDimitry Andric     buf += encodeULEB128(flags, buf);
118*5ffd83dbSDimitry Andric     buf += encodeULEB128(info->address, buf);
119*5ffd83dbSDimitry Andric   } else {
120*5ffd83dbSDimitry Andric     // TrieNode with no Symbol info.
121*5ffd83dbSDimitry Andric     *buf++ = 0; // terminalSize
122*5ffd83dbSDimitry Andric   }
123*5ffd83dbSDimitry Andric   // Add number of children. TODO: Handle case where we have more than 256.
124*5ffd83dbSDimitry Andric   assert(edges.size() < 256);
125*5ffd83dbSDimitry Andric   *buf++ = edges.size();
126*5ffd83dbSDimitry Andric   // Append each child edge substring and node offset.
127*5ffd83dbSDimitry Andric   for (const Edge &edge : edges) {
128*5ffd83dbSDimitry Andric     memcpy(buf, edge.substring.data(), edge.substring.size());
129*5ffd83dbSDimitry Andric     buf += edge.substring.size();
130*5ffd83dbSDimitry Andric     *buf++ = '\0';
131*5ffd83dbSDimitry Andric     buf += encodeULEB128(edge.child->offset, buf);
132*5ffd83dbSDimitry Andric   }
133*5ffd83dbSDimitry Andric }
134*5ffd83dbSDimitry Andric 
135*5ffd83dbSDimitry Andric TrieNode *TrieBuilder::makeNode() {
136*5ffd83dbSDimitry Andric   auto *node = make<TrieNode>();
137*5ffd83dbSDimitry Andric   nodes.emplace_back(node);
138*5ffd83dbSDimitry Andric   return node;
139*5ffd83dbSDimitry Andric }
140*5ffd83dbSDimitry Andric 
141*5ffd83dbSDimitry Andric static int charAt(const Symbol *sym, size_t pos) {
142*5ffd83dbSDimitry Andric   StringRef str = sym->getName();
143*5ffd83dbSDimitry Andric   if (pos >= str.size())
144*5ffd83dbSDimitry Andric     return -1;
145*5ffd83dbSDimitry Andric   return str[pos];
146*5ffd83dbSDimitry Andric }
147*5ffd83dbSDimitry Andric 
148*5ffd83dbSDimitry Andric // Build the trie by performing a three-way radix quicksort: We start by sorting
149*5ffd83dbSDimitry Andric // the strings by their first characters, then sort the strings with the same
150*5ffd83dbSDimitry Andric // first characters by their second characters, and so on recursively. Each
151*5ffd83dbSDimitry Andric // time the prefixes diverge, we add a node to the trie.
152*5ffd83dbSDimitry Andric //
153*5ffd83dbSDimitry Andric // node:    The most recently created node along this path in the trie (i.e.
154*5ffd83dbSDimitry Andric //          the furthest from the root.)
155*5ffd83dbSDimitry Andric // lastPos: The prefix length of the most recently created node, i.e. the number
156*5ffd83dbSDimitry Andric //          of characters along its path from the root.
157*5ffd83dbSDimitry Andric // pos:     The string index we are currently sorting on. Note that each symbol
158*5ffd83dbSDimitry Andric //          S contained in vec has the same prefix S[0...pos).
159*5ffd83dbSDimitry Andric void TrieBuilder::sortAndBuild(MutableArrayRef<const Symbol *> vec,
160*5ffd83dbSDimitry Andric                                TrieNode *node, size_t lastPos, size_t pos) {
161*5ffd83dbSDimitry Andric tailcall:
162*5ffd83dbSDimitry Andric   if (vec.empty())
163*5ffd83dbSDimitry Andric     return;
164*5ffd83dbSDimitry Andric 
165*5ffd83dbSDimitry Andric   // Partition items so that items in [0, i) are less than the pivot,
166*5ffd83dbSDimitry Andric   // [i, j) are the same as the pivot, and [j, vec.size()) are greater than
167*5ffd83dbSDimitry Andric   // the pivot.
168*5ffd83dbSDimitry Andric   const Symbol *pivotSymbol = vec[vec.size() / 2];
169*5ffd83dbSDimitry Andric   int pivot = charAt(pivotSymbol, pos);
170*5ffd83dbSDimitry Andric   size_t i = 0;
171*5ffd83dbSDimitry Andric   size_t j = vec.size();
172*5ffd83dbSDimitry Andric   for (size_t k = 0; k < j;) {
173*5ffd83dbSDimitry Andric     int c = charAt(vec[k], pos);
174*5ffd83dbSDimitry Andric     if (c < pivot)
175*5ffd83dbSDimitry Andric       std::swap(vec[i++], vec[k++]);
176*5ffd83dbSDimitry Andric     else if (c > pivot)
177*5ffd83dbSDimitry Andric       std::swap(vec[--j], vec[k]);
178*5ffd83dbSDimitry Andric     else
179*5ffd83dbSDimitry Andric       k++;
180*5ffd83dbSDimitry Andric   }
181*5ffd83dbSDimitry Andric 
182*5ffd83dbSDimitry Andric   bool isTerminal = pivot == -1;
183*5ffd83dbSDimitry Andric   bool prefixesDiverge = i != 0 || j != vec.size();
184*5ffd83dbSDimitry Andric   if (lastPos != pos && (isTerminal || prefixesDiverge)) {
185*5ffd83dbSDimitry Andric     TrieNode *newNode = makeNode();
186*5ffd83dbSDimitry Andric     node->edges.emplace_back(pivotSymbol->getName().slice(lastPos, pos),
187*5ffd83dbSDimitry Andric                              newNode);
188*5ffd83dbSDimitry Andric     node = newNode;
189*5ffd83dbSDimitry Andric     lastPos = pos;
190*5ffd83dbSDimitry Andric   }
191*5ffd83dbSDimitry Andric 
192*5ffd83dbSDimitry Andric   sortAndBuild(vec.slice(0, i), node, lastPos, pos);
193*5ffd83dbSDimitry Andric   sortAndBuild(vec.slice(j), node, lastPos, pos);
194*5ffd83dbSDimitry Andric 
195*5ffd83dbSDimitry Andric   if (isTerminal) {
196*5ffd83dbSDimitry Andric     assert(j - i == 1); // no duplicate symbols
197*5ffd83dbSDimitry Andric     node->info = {pivotSymbol->getVA()};
198*5ffd83dbSDimitry Andric   } else {
199*5ffd83dbSDimitry Andric     // This is the tail-call-optimized version of the following:
200*5ffd83dbSDimitry Andric     // sortAndBuild(vec.slice(i, j - i), node, lastPos, pos + 1);
201*5ffd83dbSDimitry Andric     vec = vec.slice(i, j - i);
202*5ffd83dbSDimitry Andric     ++pos;
203*5ffd83dbSDimitry Andric     goto tailcall;
204*5ffd83dbSDimitry Andric   }
205*5ffd83dbSDimitry Andric }
206*5ffd83dbSDimitry Andric 
207*5ffd83dbSDimitry Andric size_t TrieBuilder::build() {
208*5ffd83dbSDimitry Andric   if (exported.empty())
209*5ffd83dbSDimitry Andric     return 0;
210*5ffd83dbSDimitry Andric 
211*5ffd83dbSDimitry Andric   TrieNode *root = makeNode();
212*5ffd83dbSDimitry Andric   sortAndBuild(exported, root, 0, 0);
213*5ffd83dbSDimitry Andric 
214*5ffd83dbSDimitry Andric   // Assign each node in the vector an offset in the trie stream, iterating
215*5ffd83dbSDimitry Andric   // until all uleb128 sizes have stabilized.
216*5ffd83dbSDimitry Andric   size_t offset;
217*5ffd83dbSDimitry Andric   bool more;
218*5ffd83dbSDimitry Andric   do {
219*5ffd83dbSDimitry Andric     offset = 0;
220*5ffd83dbSDimitry Andric     more = false;
221*5ffd83dbSDimitry Andric     for (TrieNode *node : nodes)
222*5ffd83dbSDimitry Andric       more |= node->updateOffset(offset);
223*5ffd83dbSDimitry Andric   } while (more);
224*5ffd83dbSDimitry Andric 
225*5ffd83dbSDimitry Andric   return offset;
226*5ffd83dbSDimitry Andric }
227*5ffd83dbSDimitry Andric 
228*5ffd83dbSDimitry Andric void TrieBuilder::writeTo(uint8_t *buf) const {
229*5ffd83dbSDimitry Andric   for (TrieNode *node : nodes)
230*5ffd83dbSDimitry Andric     node->writeTo(buf);
231*5ffd83dbSDimitry Andric }
232*5ffd83dbSDimitry Andric 
233*5ffd83dbSDimitry Andric namespace {
234*5ffd83dbSDimitry Andric 
235*5ffd83dbSDimitry Andric // Parse a serialized trie and invoke a callback for each entry.
236*5ffd83dbSDimitry Andric class TrieParser {
237*5ffd83dbSDimitry Andric public:
238*5ffd83dbSDimitry Andric   TrieParser(const uint8_t *buf, size_t size, const TrieEntryCallback &callback)
239*5ffd83dbSDimitry Andric       : start(buf), end(start + size), callback(callback) {}
240*5ffd83dbSDimitry Andric 
241*5ffd83dbSDimitry Andric   void parse(const uint8_t *buf, const Twine &cumulativeString);
242*5ffd83dbSDimitry Andric 
243*5ffd83dbSDimitry Andric   void parse() { parse(start, ""); }
244*5ffd83dbSDimitry Andric 
245*5ffd83dbSDimitry Andric   const uint8_t *start;
246*5ffd83dbSDimitry Andric   const uint8_t *end;
247*5ffd83dbSDimitry Andric   const TrieEntryCallback &callback;
248*5ffd83dbSDimitry Andric };
249*5ffd83dbSDimitry Andric 
250*5ffd83dbSDimitry Andric } // namespace
251*5ffd83dbSDimitry Andric 
252*5ffd83dbSDimitry Andric void TrieParser::parse(const uint8_t *buf, const Twine &cumulativeString) {
253*5ffd83dbSDimitry Andric   if (buf >= end)
254*5ffd83dbSDimitry Andric     fatal("Node offset points outside export section");
255*5ffd83dbSDimitry Andric 
256*5ffd83dbSDimitry Andric   unsigned ulebSize;
257*5ffd83dbSDimitry Andric   uint64_t terminalSize = decodeULEB128(buf, &ulebSize);
258*5ffd83dbSDimitry Andric   buf += ulebSize;
259*5ffd83dbSDimitry Andric   uint64_t flags = 0;
260*5ffd83dbSDimitry Andric   size_t offset;
261*5ffd83dbSDimitry Andric   if (terminalSize != 0) {
262*5ffd83dbSDimitry Andric     flags = decodeULEB128(buf, &ulebSize);
263*5ffd83dbSDimitry Andric     callback(cumulativeString, flags);
264*5ffd83dbSDimitry Andric   }
265*5ffd83dbSDimitry Andric   buf += terminalSize;
266*5ffd83dbSDimitry Andric   uint8_t numEdges = *buf++;
267*5ffd83dbSDimitry Andric   for (uint8_t i = 0; i < numEdges; ++i) {
268*5ffd83dbSDimitry Andric     const char *cbuf = reinterpret_cast<const char *>(buf);
269*5ffd83dbSDimitry Andric     StringRef substring = StringRef(cbuf, strnlen(cbuf, end - buf));
270*5ffd83dbSDimitry Andric     buf += substring.size() + 1;
271*5ffd83dbSDimitry Andric     offset = decodeULEB128(buf, &ulebSize);
272*5ffd83dbSDimitry Andric     buf += ulebSize;
273*5ffd83dbSDimitry Andric     parse(start + offset, cumulativeString + substring);
274*5ffd83dbSDimitry Andric   }
275*5ffd83dbSDimitry Andric }
276*5ffd83dbSDimitry Andric 
277*5ffd83dbSDimitry Andric void macho::parseTrie(const uint8_t *buf, size_t size,
278*5ffd83dbSDimitry Andric                       const TrieEntryCallback &callback) {
279*5ffd83dbSDimitry Andric   if (size == 0)
280*5ffd83dbSDimitry Andric     return;
281*5ffd83dbSDimitry Andric 
282*5ffd83dbSDimitry Andric   TrieParser(buf, size, callback).parse();
283*5ffd83dbSDimitry Andric }
284