1 //===- ExportTrie.cpp -----------------------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This is a partial implementation of the Mach-O export trie format. It's 10 // essentially a symbol table encoded as a compressed prefix trie, meaning that 11 // the common prefixes of each symbol name are shared for a more compact 12 // representation. The prefixes are stored on the edges of the trie, and one 13 // edge can represent multiple characters. For example, given two exported 14 // symbols _bar and _baz, we will have a trie like this (terminal nodes are 15 // marked with an asterisk): 16 // 17 // +-+-+ 18 // | | // root node 19 // +-+-+ 20 // | 21 // | _ba 22 // | 23 // +-+-+ 24 // | | 25 // +-+-+ 26 // r / \ z 27 // / \ 28 // +-+-+ +-+-+ 29 // | * | | * | 30 // +-+-+ +-+-+ 31 // 32 // More documentation of the format can be found in 33 // llvm/tools/obj2yaml/macho2yaml.cpp. 34 // 35 //===----------------------------------------------------------------------===// 36 37 #include "ExportTrie.h" 38 #include "Symbols.h" 39 40 #include "lld/Common/ErrorHandler.h" 41 #include "llvm/BinaryFormat/MachO.h" 42 #include "llvm/Support/LEB128.h" 43 #include <optional> 44 45 using namespace llvm; 46 using namespace lld; 47 using namespace lld::macho; 48 49 namespace { 50 51 struct Edge { 52 Edge(StringRef s, TrieNode *node) : substring(s), child(node) {} 53 54 StringRef substring; 55 struct TrieNode *child; 56 }; 57 58 struct ExportInfo { 59 uint64_t address; 60 uint64_t ordinal = 0; 61 uint8_t flags = 0; 62 ExportInfo(const Symbol &sym, uint64_t imageBase) 63 : address(sym.getVA() - imageBase) { 64 using namespace llvm::MachO; 65 if (sym.isWeakDef()) 66 flags |= EXPORT_SYMBOL_FLAGS_WEAK_DEFINITION; 67 if (sym.isTlv()) 68 flags |= EXPORT_SYMBOL_FLAGS_KIND_THREAD_LOCAL; 69 // TODO: Add proper support for stub-and-resolver flags. 70 71 if (auto *defined = dyn_cast<Defined>(&sym)) { 72 if (defined->isAbsolute()) 73 flags |= EXPORT_SYMBOL_FLAGS_KIND_ABSOLUTE; 74 } else if (auto *dysym = dyn_cast<DylibSymbol>(&sym)) { 75 flags |= EXPORT_SYMBOL_FLAGS_REEXPORT; 76 if (!dysym->isDynamicLookup()) 77 ordinal = dysym->getFile()->ordinal; 78 } 79 } 80 }; 81 82 } // namespace 83 84 struct macho::TrieNode { 85 std::vector<Edge> edges; 86 std::optional<ExportInfo> info; 87 // Estimated offset from the start of the serialized trie to the current node. 88 // This will converge to the true offset when updateOffset() is run to a 89 // fixpoint. 90 size_t offset = 0; 91 92 uint32_t getTerminalSize() const; 93 // Returns whether the new estimated offset differs from the old one. 94 bool updateOffset(size_t &nextOffset); 95 void writeTo(uint8_t *buf) const; 96 }; 97 98 // For regular symbols, the node layout (excluding the children) is 99 // 100 // uleb128 terminalSize; 101 // uleb128 flags; 102 // uleb128 address; 103 // 104 // For re-exported symbols, the layout is 105 // 106 // uleb128 terminalSize; 107 // uleb128 flags; 108 // uleb128 ordinal; 109 // char[] originalName; 110 // 111 // If libfoo.dylib is linked against libbar.dylib, and libfoo exports an alias 112 // _foo to a symbol _bar in libbar, then originalName will be "_bar". If libfoo 113 // re-exports _bar directly (i.e. not via an alias), then originalName will be 114 // the empty string. 115 // 116 // TODO: Support aliased re-exports. (Since we don't yet support these, 117 // originalName will always be the empty string.) 118 // 119 // For stub-and-resolver nodes, the layout is 120 // 121 // uleb128 terminalSize; 122 // uleb128 flags; 123 // uleb128 stubAddress; 124 // uleb128 resolverAddress; 125 // 126 // TODO: Support stub-and-resolver nodes. 127 uint32_t TrieNode::getTerminalSize() const { 128 uint32_t size = getULEB128Size(info->flags); 129 if (info->flags & MachO::EXPORT_SYMBOL_FLAGS_REEXPORT) 130 size += getULEB128Size(info->ordinal) + 1; // + 1 for the null-terminator 131 else 132 size += getULEB128Size(info->address); 133 return size; 134 } 135 136 bool TrieNode::updateOffset(size_t &nextOffset) { 137 // Size of the whole node (including the terminalSize and the outgoing edges.) 138 // In contrast, terminalSize only records the size of the other data in the 139 // node. 140 size_t nodeSize; 141 if (info) { 142 uint32_t terminalSize = getTerminalSize(); 143 // Overall node size so far is the uleb128 size of the length of the symbol 144 // info + the symbol info itself. 145 nodeSize = terminalSize + getULEB128Size(terminalSize); 146 } else { 147 nodeSize = 1; // Size of terminalSize (which has a value of 0) 148 } 149 // Compute size of all child edges. 150 ++nodeSize; // Byte for number of children. 151 for (const Edge &edge : edges) { 152 nodeSize += edge.substring.size() + 1 // String length. 153 + getULEB128Size(edge.child->offset); // Offset len. 154 } 155 // On input, 'nextOffset' is the new preferred location for this node. 156 bool result = (offset != nextOffset); 157 // Store new location in node object for use by parents. 158 offset = nextOffset; 159 nextOffset += nodeSize; 160 return result; 161 } 162 163 void TrieNode::writeTo(uint8_t *buf) const { 164 buf += offset; 165 if (info) { 166 uint32_t terminalSize = getTerminalSize(); 167 buf += encodeULEB128(terminalSize, buf); 168 buf += encodeULEB128(info->flags, buf); 169 if (info->flags & MachO::EXPORT_SYMBOL_FLAGS_REEXPORT) { 170 buf += encodeULEB128(info->ordinal, buf); 171 *buf++ = 0; // empty originalName string 172 } else { 173 buf += encodeULEB128(info->address, buf); 174 } 175 } else { 176 // TrieNode with no Symbol info. 177 *buf++ = 0; // terminalSize 178 } 179 // Add number of children. TODO: Handle case where we have more than 256. 180 assert(edges.size() < 256); 181 *buf++ = edges.size(); 182 // Append each child edge substring and node offset. 183 for (const Edge &edge : edges) { 184 memcpy(buf, edge.substring.data(), edge.substring.size()); 185 buf += edge.substring.size(); 186 *buf++ = '\0'; 187 buf += encodeULEB128(edge.child->offset, buf); 188 } 189 } 190 191 TrieBuilder::~TrieBuilder() { 192 for (TrieNode *node : nodes) 193 delete node; 194 } 195 196 TrieNode *TrieBuilder::makeNode() { 197 auto *node = new TrieNode(); 198 nodes.emplace_back(node); 199 return node; 200 } 201 202 static int charAt(const Symbol *sym, size_t pos) { 203 StringRef str = sym->getName(); 204 if (pos >= str.size()) 205 return -1; 206 return str[pos]; 207 } 208 209 // Build the trie by performing a three-way radix quicksort: We start by sorting 210 // the strings by their first characters, then sort the strings with the same 211 // first characters by their second characters, and so on recursively. Each 212 // time the prefixes diverge, we add a node to the trie. 213 // 214 // node: The most recently created node along this path in the trie (i.e. 215 // the furthest from the root.) 216 // lastPos: The prefix length of the most recently created node, i.e. the number 217 // of characters along its path from the root. 218 // pos: The string index we are currently sorting on. Note that each symbol 219 // S contained in vec has the same prefix S[0...pos). 220 void TrieBuilder::sortAndBuild(MutableArrayRef<const Symbol *> vec, 221 TrieNode *node, size_t lastPos, size_t pos) { 222 tailcall: 223 if (vec.empty()) 224 return; 225 226 // Partition items so that items in [0, i) are less than the pivot, 227 // [i, j) are the same as the pivot, and [j, vec.size()) are greater than 228 // the pivot. 229 const Symbol *pivotSymbol = vec[vec.size() / 2]; 230 int pivot = charAt(pivotSymbol, pos); 231 size_t i = 0; 232 size_t j = vec.size(); 233 for (size_t k = 0; k < j;) { 234 int c = charAt(vec[k], pos); 235 if (c < pivot) 236 std::swap(vec[i++], vec[k++]); 237 else if (c > pivot) 238 std::swap(vec[--j], vec[k]); 239 else 240 k++; 241 } 242 243 bool isTerminal = pivot == -1; 244 bool prefixesDiverge = i != 0 || j != vec.size(); 245 if (lastPos != pos && (isTerminal || prefixesDiverge)) { 246 TrieNode *newNode = makeNode(); 247 node->edges.emplace_back(pivotSymbol->getName().slice(lastPos, pos), 248 newNode); 249 node = newNode; 250 lastPos = pos; 251 } 252 253 sortAndBuild(vec.slice(0, i), node, lastPos, pos); 254 sortAndBuild(vec.slice(j), node, lastPos, pos); 255 256 if (isTerminal) { 257 assert(j - i == 1); // no duplicate symbols 258 node->info = ExportInfo(*pivotSymbol, imageBase); 259 } else { 260 // This is the tail-call-optimized version of the following: 261 // sortAndBuild(vec.slice(i, j - i), node, lastPos, pos + 1); 262 vec = vec.slice(i, j - i); 263 ++pos; 264 goto tailcall; 265 } 266 } 267 268 size_t TrieBuilder::build() { 269 if (exported.empty()) 270 return 0; 271 272 TrieNode *root = makeNode(); 273 sortAndBuild(exported, root, 0, 0); 274 275 // Assign each node in the vector an offset in the trie stream, iterating 276 // until all uleb128 sizes have stabilized. 277 size_t offset; 278 bool more; 279 do { 280 offset = 0; 281 more = false; 282 for (TrieNode *node : nodes) 283 more |= node->updateOffset(offset); 284 } while (more); 285 286 return offset; 287 } 288 289 void TrieBuilder::writeTo(uint8_t *buf) const { 290 for (TrieNode *node : nodes) 291 node->writeTo(buf); 292 } 293 294 namespace { 295 296 // Parse a serialized trie and invoke a callback for each entry. 297 class TrieParser { 298 public: 299 TrieParser(const uint8_t *buf, size_t size, const TrieEntryCallback &callback) 300 : start(buf), end(start + size), callback(callback) {} 301 302 void parse(const uint8_t *buf, const Twine &cumulativeString); 303 304 void parse() { parse(start, ""); } 305 306 const uint8_t *start; 307 const uint8_t *end; 308 const TrieEntryCallback &callback; 309 }; 310 311 } // namespace 312 313 void TrieParser::parse(const uint8_t *buf, const Twine &cumulativeString) { 314 if (buf >= end) 315 fatal("Node offset points outside export section"); 316 317 unsigned ulebSize; 318 uint64_t terminalSize = decodeULEB128(buf, &ulebSize); 319 buf += ulebSize; 320 uint64_t flags = 0; 321 size_t offset; 322 if (terminalSize != 0) { 323 flags = decodeULEB128(buf, &ulebSize); 324 callback(cumulativeString, flags); 325 } 326 buf += terminalSize; 327 uint8_t numEdges = *buf++; 328 for (uint8_t i = 0; i < numEdges; ++i) { 329 const char *cbuf = reinterpret_cast<const char *>(buf); 330 StringRef substring = StringRef(cbuf, strnlen(cbuf, end - buf)); 331 buf += substring.size() + 1; 332 offset = decodeULEB128(buf, &ulebSize); 333 buf += ulebSize; 334 parse(start + offset, cumulativeString + substring); 335 } 336 } 337 338 void macho::parseTrie(const uint8_t *buf, size_t size, 339 const TrieEntryCallback &callback) { 340 if (size == 0) 341 return; 342 343 TrieParser(buf, size, callback).parse(); 344 } 345