1 //===- ExportTrie.cpp -----------------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is a partial implementation of the Mach-O export trie format. It's
10 // essentially a symbol table encoded as a compressed prefix trie, meaning that
11 // the common prefixes of each symbol name are shared for a more compact
12 // representation. The prefixes are stored on the edges of the trie, and one
13 // edge can represent multiple characters. For example, given two exported
14 // symbols _bar and _baz, we will have a trie like this (terminal nodes are
15 // marked with an asterisk):
16 //
//             +-+-+
//             |   | // root node
//             +-+-+
//               |
//               | _ba
//               |
//             +-+-+
//             |   |
//             +-+-+
//           r /   \ z
//            /     \
//        +-+-+     +-+-+
//        | * |     | * |
//        +-+-+     +-+-+
31 //
32 // More documentation of the format can be found in
33 // llvm/tools/obj2yaml/macho2yaml.cpp.
34 //
35 //===----------------------------------------------------------------------===//
36
37 #include "ExportTrie.h"
38 #include "Symbols.h"
39
40 #include "lld/Common/ErrorHandler.h"
41 #include "lld/Common/Memory.h"
42 #include "llvm/BinaryFormat/MachO.h"
43 #include "llvm/Support/LEB128.h"
44 #include <optional>
45
46 using namespace llvm;
47 using namespace lld;
48 using namespace lld::macho;
49
50 namespace {
51
52 struct Edge {
Edge__anonf3d890b80111::Edge53 Edge(StringRef s, TrieNode *node) : substring(s), child(node) {}
54
55 StringRef substring;
56 struct TrieNode *child;
57 };
58
59 struct ExportInfo {
60 uint64_t address;
61 uint64_t ordinal = 0;
62 uint8_t flags = 0;
ExportInfo__anonf3d890b80111::ExportInfo63 ExportInfo(const Symbol &sym, uint64_t imageBase)
64 : address(sym.getVA() - imageBase) {
65 using namespace llvm::MachO;
66 if (sym.isWeakDef())
67 flags |= EXPORT_SYMBOL_FLAGS_WEAK_DEFINITION;
68 if (sym.isTlv())
69 flags |= EXPORT_SYMBOL_FLAGS_KIND_THREAD_LOCAL;
70 // TODO: Add proper support for stub-and-resolver flags.
71
72 if (auto *defined = dyn_cast<Defined>(&sym)) {
73 if (defined->isAbsolute())
74 flags |= EXPORT_SYMBOL_FLAGS_KIND_ABSOLUTE;
75 } else if (auto *dysym = dyn_cast<DylibSymbol>(&sym)) {
76 flags |= EXPORT_SYMBOL_FLAGS_REEXPORT;
77 if (!dysym->isDynamicLookup())
78 ordinal = dysym->getFile()->ordinal;
79 }
80 }
81 };
82
83 } // namespace
84
85 struct macho::TrieNode {
86 std::vector<Edge> edges;
87 std::optional<ExportInfo> info;
88 // Estimated offset from the start of the serialized trie to the current node.
89 // This will converge to the true offset when updateOffset() is run to a
90 // fixpoint.
91 size_t offset = 0;
92
93 uint32_t getTerminalSize() const;
94 // Returns whether the new estimated offset differs from the old one.
95 bool updateOffset(size_t &nextOffset);
96 void writeTo(uint8_t *buf) const;
97 };
98
99 // For regular symbols, the node layout (excluding the children) is
100 //
101 // uleb128 terminalSize;
102 // uleb128 flags;
103 // uleb128 address;
104 //
105 // For re-exported symbols, the layout is
106 //
107 // uleb128 terminalSize;
108 // uleb128 flags;
109 // uleb128 ordinal;
110 // char[] originalName;
111 //
112 // If libfoo.dylib is linked against libbar.dylib, and libfoo exports an alias
113 // _foo to a symbol _bar in libbar, then originalName will be "_bar". If libfoo
114 // re-exports _bar directly (i.e. not via an alias), then originalName will be
115 // the empty string.
116 //
117 // TODO: Support aliased re-exports. (Since we don't yet support these,
118 // originalName will always be the empty string.)
119 //
120 // For stub-and-resolver nodes, the layout is
121 //
122 // uleb128 terminalSize;
123 // uleb128 flags;
124 // uleb128 stubAddress;
125 // uleb128 resolverAddress;
126 //
127 // TODO: Support stub-and-resolver nodes.
getTerminalSize() const128 uint32_t TrieNode::getTerminalSize() const {
129 uint32_t size = getULEB128Size(info->flags);
130 if (info->flags & MachO::EXPORT_SYMBOL_FLAGS_REEXPORT)
131 size += getULEB128Size(info->ordinal) + 1; // + 1 for the null-terminator
132 else
133 size += getULEB128Size(info->address);
134 return size;
135 }
136
updateOffset(size_t & nextOffset)137 bool TrieNode::updateOffset(size_t &nextOffset) {
138 // Size of the whole node (including the terminalSize and the outgoing edges.)
139 // In contrast, terminalSize only records the size of the other data in the
140 // node.
141 size_t nodeSize;
142 if (info) {
143 uint32_t terminalSize = getTerminalSize();
144 // Overall node size so far is the uleb128 size of the length of the symbol
145 // info + the symbol info itself.
146 nodeSize = terminalSize + getULEB128Size(terminalSize);
147 } else {
148 nodeSize = 1; // Size of terminalSize (which has a value of 0)
149 }
150 // Compute size of all child edges.
151 ++nodeSize; // Byte for number of children.
152 for (const Edge &edge : edges) {
153 nodeSize += edge.substring.size() + 1 // String length.
154 + getULEB128Size(edge.child->offset); // Offset len.
155 }
156 // On input, 'nextOffset' is the new preferred location for this node.
157 bool result = (offset != nextOffset);
158 // Store new location in node object for use by parents.
159 offset = nextOffset;
160 nextOffset += nodeSize;
161 return result;
162 }
163
writeTo(uint8_t * buf) const164 void TrieNode::writeTo(uint8_t *buf) const {
165 buf += offset;
166 if (info) {
167 uint32_t terminalSize = getTerminalSize();
168 buf += encodeULEB128(terminalSize, buf);
169 buf += encodeULEB128(info->flags, buf);
170 if (info->flags & MachO::EXPORT_SYMBOL_FLAGS_REEXPORT) {
171 buf += encodeULEB128(info->ordinal, buf);
172 *buf++ = 0; // empty originalName string
173 } else {
174 buf += encodeULEB128(info->address, buf);
175 }
176 } else {
177 // TrieNode with no Symbol info.
178 *buf++ = 0; // terminalSize
179 }
180 // Add number of children. TODO: Handle case where we have more than 256.
181 assert(edges.size() < 256);
182 *buf++ = edges.size();
183 // Append each child edge substring and node offset.
184 for (const Edge &edge : edges) {
185 memcpy(buf, edge.substring.data(), edge.substring.size());
186 buf += edge.substring.size();
187 *buf++ = '\0';
188 buf += encodeULEB128(edge.child->offset, buf);
189 }
190 }
191
~TrieBuilder()192 TrieBuilder::~TrieBuilder() {
193 for (TrieNode *node : nodes)
194 delete node;
195 }
196
makeNode()197 TrieNode *TrieBuilder::makeNode() {
198 auto *node = new TrieNode();
199 nodes.emplace_back(node);
200 return node;
201 }
202
charAt(const Symbol * sym,size_t pos)203 static int charAt(const Symbol *sym, size_t pos) {
204 StringRef str = sym->getName();
205 if (pos >= str.size())
206 return -1;
207 return str[pos];
208 }
209
210 // Build the trie by performing a three-way radix quicksort: We start by sorting
211 // the strings by their first characters, then sort the strings with the same
212 // first characters by their second characters, and so on recursively. Each
213 // time the prefixes diverge, we add a node to the trie.
214 //
215 // node: The most recently created node along this path in the trie (i.e.
216 // the furthest from the root.)
217 // lastPos: The prefix length of the most recently created node, i.e. the number
218 // of characters along its path from the root.
219 // pos: The string index we are currently sorting on. Note that each symbol
220 // S contained in vec has the same prefix S[0...pos).
sortAndBuild(MutableArrayRef<const Symbol * > vec,TrieNode * node,size_t lastPos,size_t pos)221 void TrieBuilder::sortAndBuild(MutableArrayRef<const Symbol *> vec,
222 TrieNode *node, size_t lastPos, size_t pos) {
223 tailcall:
224 if (vec.empty())
225 return;
226
227 // Partition items so that items in [0, i) are less than the pivot,
228 // [i, j) are the same as the pivot, and [j, vec.size()) are greater than
229 // the pivot.
230 const Symbol *pivotSymbol = vec[vec.size() / 2];
231 int pivot = charAt(pivotSymbol, pos);
232 size_t i = 0;
233 size_t j = vec.size();
234 for (size_t k = 0; k < j;) {
235 int c = charAt(vec[k], pos);
236 if (c < pivot)
237 std::swap(vec[i++], vec[k++]);
238 else if (c > pivot)
239 std::swap(vec[--j], vec[k]);
240 else
241 k++;
242 }
243
244 bool isTerminal = pivot == -1;
245 bool prefixesDiverge = i != 0 || j != vec.size();
246 if (lastPos != pos && (isTerminal || prefixesDiverge)) {
247 TrieNode *newNode = makeNode();
248 node->edges.emplace_back(pivotSymbol->getName().slice(lastPos, pos),
249 newNode);
250 node = newNode;
251 lastPos = pos;
252 }
253
254 sortAndBuild(vec.slice(0, i), node, lastPos, pos);
255 sortAndBuild(vec.slice(j), node, lastPos, pos);
256
257 if (isTerminal) {
258 assert(j - i == 1); // no duplicate symbols
259 node->info = ExportInfo(*pivotSymbol, imageBase);
260 } else {
261 // This is the tail-call-optimized version of the following:
262 // sortAndBuild(vec.slice(i, j - i), node, lastPos, pos + 1);
263 vec = vec.slice(i, j - i);
264 ++pos;
265 goto tailcall;
266 }
267 }
268
build()269 size_t TrieBuilder::build() {
270 if (exported.empty())
271 return 0;
272
273 TrieNode *root = makeNode();
274 sortAndBuild(exported, root, 0, 0);
275
276 // Assign each node in the vector an offset in the trie stream, iterating
277 // until all uleb128 sizes have stabilized.
278 size_t offset;
279 bool more;
280 do {
281 offset = 0;
282 more = false;
283 for (TrieNode *node : nodes)
284 more |= node->updateOffset(offset);
285 } while (more);
286
287 return offset;
288 }
289
writeTo(uint8_t * buf) const290 void TrieBuilder::writeTo(uint8_t *buf) const {
291 for (TrieNode *node : nodes)
292 node->writeTo(buf);
293 }
294
295 namespace {
296
297 // Parse a serialized trie and invoke a callback for each entry.
298 class TrieParser {
299 public:
TrieParser(const uint8_t * buf,size_t size,const TrieEntryCallback & callback)300 TrieParser(const uint8_t *buf, size_t size, const TrieEntryCallback &callback)
301 : start(buf), end(start + size), callback(callback) {}
302
303 void parse(const uint8_t *buf, const Twine &cumulativeString);
304
parse()305 void parse() { parse(start, ""); }
306
307 const uint8_t *start;
308 const uint8_t *end;
309 const TrieEntryCallback &callback;
310 };
311
312 } // namespace
313
parse(const uint8_t * buf,const Twine & cumulativeString)314 void TrieParser::parse(const uint8_t *buf, const Twine &cumulativeString) {
315 if (buf >= end)
316 fatal("Node offset points outside export section");
317
318 unsigned ulebSize;
319 uint64_t terminalSize = decodeULEB128(buf, &ulebSize);
320 buf += ulebSize;
321 uint64_t flags = 0;
322 size_t offset;
323 if (terminalSize != 0) {
324 flags = decodeULEB128(buf, &ulebSize);
325 callback(cumulativeString, flags);
326 }
327 buf += terminalSize;
328 uint8_t numEdges = *buf++;
329 for (uint8_t i = 0; i < numEdges; ++i) {
330 const char *cbuf = reinterpret_cast<const char *>(buf);
331 StringRef substring = StringRef(cbuf, strnlen(cbuf, end - buf));
332 buf += substring.size() + 1;
333 offset = decodeULEB128(buf, &ulebSize);
334 buf += ulebSize;
335 parse(start + offset, cumulativeString + substring);
336 }
337 }
338
parseTrie(const uint8_t * buf,size_t size,const TrieEntryCallback & callback)339 void macho::parseTrie(const uint8_t *buf, size_t size,
340 const TrieEntryCallback &callback) {
341 if (size == 0)
342 return;
343
344 TrieParser(buf, size, callback).parse();
345 }
346