xref: /freebsd/contrib/llvm-project/lld/MachO/ExportTrie.cpp (revision 700637cbb5e582861067a11aaca4d053546871d2)
1 //===- ExportTrie.cpp -----------------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is a partial implementation of the Mach-O export trie format. It's
10 // essentially a symbol table encoded as a compressed prefix trie, meaning that
11 // the common prefixes of each symbol name are shared for a more compact
12 // representation. The prefixes are stored on the edges of the trie, and one
13 // edge can represent multiple characters. For example, given two exported
14 // symbols _bar and _baz, we will have a trie like this (terminal nodes are
15 // marked with an asterisk):
16 //
17 //              +-+-+
18 //              |   | // root node
19 //              +-+-+
20 //                |
21 //                | _ba
22 //                |
23 //              +-+-+
24 //              |   |
25 //              +-+-+
26 //           r /     \ z
27 //            /       \
28 //        +-+-+       +-+-+
29 //        | * |       | * |
30 //        +-+-+       +-+-+
31 //
32 // More documentation of the format can be found in
33 // llvm/tools/obj2yaml/macho2yaml.cpp.
34 //
35 //===----------------------------------------------------------------------===//
36 
37 #include "ExportTrie.h"
38 #include "Symbols.h"
39 
40 #include "lld/Common/ErrorHandler.h"
41 #include "llvm/BinaryFormat/MachO.h"
42 #include "llvm/Support/LEB128.h"
43 #include <optional>
44 
45 using namespace llvm;
46 using namespace lld;
47 using namespace lld::macho;
48 
49 namespace {
50 
51 struct Edge {
Edge__anonf3d890b80111::Edge52   Edge(StringRef s, TrieNode *node) : substring(s), child(node) {}
53 
54   StringRef substring;
55   struct TrieNode *child;
56 };
57 
58 struct ExportInfo {
59   uint64_t address;
60   uint64_t ordinal = 0;
61   uint8_t flags = 0;
ExportInfo__anonf3d890b80111::ExportInfo62   ExportInfo(const Symbol &sym, uint64_t imageBase)
63       : address(sym.getVA() - imageBase) {
64     using namespace llvm::MachO;
65     if (sym.isWeakDef())
66       flags |= EXPORT_SYMBOL_FLAGS_WEAK_DEFINITION;
67     if (sym.isTlv())
68       flags |= EXPORT_SYMBOL_FLAGS_KIND_THREAD_LOCAL;
69     // TODO: Add proper support for stub-and-resolver flags.
70 
71     if (auto *defined = dyn_cast<Defined>(&sym)) {
72       if (defined->isAbsolute())
73         flags |= EXPORT_SYMBOL_FLAGS_KIND_ABSOLUTE;
74     } else if (auto *dysym = dyn_cast<DylibSymbol>(&sym)) {
75       flags |= EXPORT_SYMBOL_FLAGS_REEXPORT;
76       if (!dysym->isDynamicLookup())
77         ordinal = dysym->getFile()->ordinal;
78     }
79   }
80 };
81 
82 } // namespace
83 
84 struct macho::TrieNode {
85   std::vector<Edge> edges;
86   std::optional<ExportInfo> info;
87   // Estimated offset from the start of the serialized trie to the current node.
88   // This will converge to the true offset when updateOffset() is run to a
89   // fixpoint.
90   size_t offset = 0;
91 
92   uint32_t getTerminalSize() const;
93   // Returns whether the new estimated offset differs from the old one.
94   bool updateOffset(size_t &nextOffset);
95   void writeTo(uint8_t *buf) const;
96 };
97 
98 // For regular symbols, the node layout (excluding the children) is
99 //
100 //   uleb128 terminalSize;
101 //   uleb128 flags;
102 //   uleb128 address;
103 //
104 // For re-exported symbols, the layout is
105 //
106 //   uleb128 terminalSize;
107 //   uleb128 flags;
108 //   uleb128 ordinal;
109 //   char[] originalName;
110 //
111 // If libfoo.dylib is linked against libbar.dylib, and libfoo exports an alias
112 // _foo to a symbol _bar in libbar, then originalName will be "_bar". If libfoo
113 // re-exports _bar directly (i.e. not via an alias), then originalName will be
114 // the empty string.
115 //
116 // TODO: Support aliased re-exports. (Since we don't yet support these,
117 // originalName will always be the empty string.)
118 //
119 // For stub-and-resolver nodes, the layout is
120 //
121 //   uleb128 terminalSize;
122 //   uleb128 flags;
123 //   uleb128 stubAddress;
124 //   uleb128 resolverAddress;
125 //
126 // TODO: Support stub-and-resolver nodes.
getTerminalSize() const127 uint32_t TrieNode::getTerminalSize() const {
128   uint32_t size = getULEB128Size(info->flags);
129   if (info->flags & MachO::EXPORT_SYMBOL_FLAGS_REEXPORT)
130     size += getULEB128Size(info->ordinal) + 1; // + 1 for the null-terminator
131   else
132     size += getULEB128Size(info->address);
133   return size;
134 }
135 
updateOffset(size_t & nextOffset)136 bool TrieNode::updateOffset(size_t &nextOffset) {
137   // Size of the whole node (including the terminalSize and the outgoing edges.)
138   // In contrast, terminalSize only records the size of the other data in the
139   // node.
140   size_t nodeSize;
141   if (info) {
142     uint32_t terminalSize = getTerminalSize();
143     // Overall node size so far is the uleb128 size of the length of the symbol
144     // info + the symbol info itself.
145     nodeSize = terminalSize + getULEB128Size(terminalSize);
146   } else {
147     nodeSize = 1; // Size of terminalSize (which has a value of 0)
148   }
149   // Compute size of all child edges.
150   ++nodeSize; // Byte for number of children.
151   for (const Edge &edge : edges) {
152     nodeSize += edge.substring.size() + 1             // String length.
153                 + getULEB128Size(edge.child->offset); // Offset len.
154   }
155   // On input, 'nextOffset' is the new preferred location for this node.
156   bool result = (offset != nextOffset);
157   // Store new location in node object for use by parents.
158   offset = nextOffset;
159   nextOffset += nodeSize;
160   return result;
161 }
162 
writeTo(uint8_t * buf) const163 void TrieNode::writeTo(uint8_t *buf) const {
164   buf += offset;
165   if (info) {
166     uint32_t terminalSize = getTerminalSize();
167     buf += encodeULEB128(terminalSize, buf);
168     buf += encodeULEB128(info->flags, buf);
169     if (info->flags & MachO::EXPORT_SYMBOL_FLAGS_REEXPORT) {
170       buf += encodeULEB128(info->ordinal, buf);
171       *buf++ = 0; // empty originalName string
172     } else {
173       buf += encodeULEB128(info->address, buf);
174     }
175   } else {
176     // TrieNode with no Symbol info.
177     *buf++ = 0; // terminalSize
178   }
179   // Add number of children. TODO: Handle case where we have more than 256.
180   assert(edges.size() < 256);
181   *buf++ = edges.size();
182   // Append each child edge substring and node offset.
183   for (const Edge &edge : edges) {
184     memcpy(buf, edge.substring.data(), edge.substring.size());
185     buf += edge.substring.size();
186     *buf++ = '\0';
187     buf += encodeULEB128(edge.child->offset, buf);
188   }
189 }
190 
~TrieBuilder()191 TrieBuilder::~TrieBuilder() {
192   for (TrieNode *node : nodes)
193     delete node;
194 }
195 
makeNode()196 TrieNode *TrieBuilder::makeNode() {
197   auto *node = new TrieNode();
198   nodes.emplace_back(node);
199   return node;
200 }
201 
charAt(const Symbol * sym,size_t pos)202 static int charAt(const Symbol *sym, size_t pos) {
203   StringRef str = sym->getName();
204   if (pos >= str.size())
205     return -1;
206   return str[pos];
207 }
208 
209 // Build the trie by performing a three-way radix quicksort: We start by sorting
210 // the strings by their first characters, then sort the strings with the same
211 // first characters by their second characters, and so on recursively. Each
212 // time the prefixes diverge, we add a node to the trie.
213 //
214 // node:    The most recently created node along this path in the trie (i.e.
215 //          the furthest from the root.)
216 // lastPos: The prefix length of the most recently created node, i.e. the number
217 //          of characters along its path from the root.
218 // pos:     The string index we are currently sorting on. Note that each symbol
219 //          S contained in vec has the same prefix S[0...pos).
sortAndBuild(MutableArrayRef<const Symbol * > vec,TrieNode * node,size_t lastPos,size_t pos)220 void TrieBuilder::sortAndBuild(MutableArrayRef<const Symbol *> vec,
221                                TrieNode *node, size_t lastPos, size_t pos) {
222 tailcall:
223   if (vec.empty())
224     return;
225 
226   // Partition items so that items in [0, i) are less than the pivot,
227   // [i, j) are the same as the pivot, and [j, vec.size()) are greater than
228   // the pivot.
229   const Symbol *pivotSymbol = vec[vec.size() / 2];
230   int pivot = charAt(pivotSymbol, pos);
231   size_t i = 0;
232   size_t j = vec.size();
233   for (size_t k = 0; k < j;) {
234     int c = charAt(vec[k], pos);
235     if (c < pivot)
236       std::swap(vec[i++], vec[k++]);
237     else if (c > pivot)
238       std::swap(vec[--j], vec[k]);
239     else
240       k++;
241   }
242 
243   bool isTerminal = pivot == -1;
244   bool prefixesDiverge = i != 0 || j != vec.size();
245   if (lastPos != pos && (isTerminal || prefixesDiverge)) {
246     TrieNode *newNode = makeNode();
247     node->edges.emplace_back(pivotSymbol->getName().slice(lastPos, pos),
248                              newNode);
249     node = newNode;
250     lastPos = pos;
251   }
252 
253   sortAndBuild(vec.slice(0, i), node, lastPos, pos);
254   sortAndBuild(vec.slice(j), node, lastPos, pos);
255 
256   if (isTerminal) {
257     assert(j - i == 1); // no duplicate symbols
258     node->info = ExportInfo(*pivotSymbol, imageBase);
259   } else {
260     // This is the tail-call-optimized version of the following:
261     // sortAndBuild(vec.slice(i, j - i), node, lastPos, pos + 1);
262     vec = vec.slice(i, j - i);
263     ++pos;
264     goto tailcall;
265   }
266 }
267 
build()268 size_t TrieBuilder::build() {
269   if (exported.empty())
270     return 0;
271 
272   TrieNode *root = makeNode();
273   sortAndBuild(exported, root, 0, 0);
274 
275   // Assign each node in the vector an offset in the trie stream, iterating
276   // until all uleb128 sizes have stabilized.
277   size_t offset;
278   bool more;
279   do {
280     offset = 0;
281     more = false;
282     for (TrieNode *node : nodes)
283       more |= node->updateOffset(offset);
284   } while (more);
285 
286   return offset;
287 }
288 
writeTo(uint8_t * buf) const289 void TrieBuilder::writeTo(uint8_t *buf) const {
290   for (TrieNode *node : nodes)
291     node->writeTo(buf);
292 }
293 
294 namespace {
295 
296 // Parse a serialized trie and invoke a callback for each entry.
297 class TrieParser {
298 public:
TrieParser(const uint8_t * buf,size_t size,const TrieEntryCallback & callback)299   TrieParser(const uint8_t *buf, size_t size, const TrieEntryCallback &callback)
300       : start(buf), end(start + size), callback(callback) {}
301 
302   void parse(const uint8_t *buf, const Twine &cumulativeString);
303 
parse()304   void parse() { parse(start, ""); }
305 
306   const uint8_t *start;
307   const uint8_t *end;
308   const TrieEntryCallback &callback;
309 };
310 
311 } // namespace
312 
parse(const uint8_t * buf,const Twine & cumulativeString)313 void TrieParser::parse(const uint8_t *buf, const Twine &cumulativeString) {
314   if (buf >= end)
315     fatal("Node offset points outside export section");
316 
317   unsigned ulebSize;
318   uint64_t terminalSize = decodeULEB128(buf, &ulebSize);
319   buf += ulebSize;
320   uint64_t flags = 0;
321   size_t offset;
322   if (terminalSize != 0) {
323     flags = decodeULEB128(buf, &ulebSize);
324     callback(cumulativeString, flags);
325   }
326   buf += terminalSize;
327   uint8_t numEdges = *buf++;
328   for (uint8_t i = 0; i < numEdges; ++i) {
329     const char *cbuf = reinterpret_cast<const char *>(buf);
330     StringRef substring = StringRef(cbuf, strnlen(cbuf, end - buf));
331     buf += substring.size() + 1;
332     offset = decodeULEB128(buf, &ulebSize);
333     buf += ulebSize;
334     parse(start + offset, cumulativeString + substring);
335   }
336 }
337 
parseTrie(const uint8_t * buf,size_t size,const TrieEntryCallback & callback)338 void macho::parseTrie(const uint8_t *buf, size_t size,
339                       const TrieEntryCallback &callback) {
340   if (size == 0)
341     return;
342 
343   TrieParser(buf, size, callback).parse();
344 }
345