//===- ExportTrie.cpp -----------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is a partial implementation of the Mach-O export trie format. It's
// essentially a symbol table encoded as a compressed prefix trie, meaning that
// the common prefixes of each symbol name are shared for a more compact
// representation. The prefixes are stored on the edges of the trie, and one
// edge can represent multiple characters. For example, given two exported
// symbols _bar and _baz, we will have a trie like this (terminal nodes are
// marked with an asterisk):
//
//              +-+-+
//              |   | // root node
//              +-+-+
//                |
//                | _ba
//                |
//              +-+-+
//              |   |
//              +-+-+
//           r /     \ z
//            /       \
//        +-+-+       +-+-+
//        | * |       | * |
//        +-+-+       +-+-+
//
// More documentation of the format can be found in
// llvm/tools/obj2yaml/macho2yaml.cpp.
34 // 35 //===----------------------------------------------------------------------===// 36 37 #include "ExportTrie.h" 38 #include "Symbols.h" 39 40 #include "lld/Common/ErrorHandler.h" 41 #include "lld/Common/Memory.h" 42 #include "llvm/BinaryFormat/MachO.h" 43 #include "llvm/Support/LEB128.h" 44 #include <optional> 45 46 using namespace llvm; 47 using namespace lld; 48 using namespace lld::macho; 49 50 namespace { 51 52 struct Edge { 53 Edge(StringRef s, TrieNode *node) : substring(s), child(node) {} 54 55 StringRef substring; 56 struct TrieNode *child; 57 }; 58 59 struct ExportInfo { 60 uint64_t address; 61 uint64_t ordinal = 0; 62 uint8_t flags = 0; 63 ExportInfo(const Symbol &sym, uint64_t imageBase) 64 : address(sym.getVA() - imageBase) { 65 using namespace llvm::MachO; 66 if (sym.isWeakDef()) 67 flags |= EXPORT_SYMBOL_FLAGS_WEAK_DEFINITION; 68 if (sym.isTlv()) 69 flags |= EXPORT_SYMBOL_FLAGS_KIND_THREAD_LOCAL; 70 // TODO: Add proper support for stub-and-resolver flags. 71 72 if (auto *defined = dyn_cast<Defined>(&sym)) { 73 if (defined->isAbsolute()) 74 flags |= EXPORT_SYMBOL_FLAGS_KIND_ABSOLUTE; 75 } else if (auto *dysym = dyn_cast<DylibSymbol>(&sym)) { 76 flags |= EXPORT_SYMBOL_FLAGS_REEXPORT; 77 if (!dysym->isDynamicLookup()) 78 ordinal = dysym->getFile()->ordinal; 79 } 80 } 81 }; 82 83 } // namespace 84 85 struct macho::TrieNode { 86 std::vector<Edge> edges; 87 std::optional<ExportInfo> info; 88 // Estimated offset from the start of the serialized trie to the current node. 89 // This will converge to the true offset when updateOffset() is run to a 90 // fixpoint. 91 size_t offset = 0; 92 93 uint32_t getTerminalSize() const; 94 // Returns whether the new estimated offset differs from the old one. 
95 bool updateOffset(size_t &nextOffset); 96 void writeTo(uint8_t *buf) const; 97 }; 98 99 // For regular symbols, the node layout (excluding the children) is 100 // 101 // uleb128 terminalSize; 102 // uleb128 flags; 103 // uleb128 address; 104 // 105 // For re-exported symbols, the layout is 106 // 107 // uleb128 terminalSize; 108 // uleb128 flags; 109 // uleb128 ordinal; 110 // char[] originalName; 111 // 112 // If libfoo.dylib is linked against libbar.dylib, and libfoo exports an alias 113 // _foo to a symbol _bar in libbar, then originalName will be "_bar". If libfoo 114 // re-exports _bar directly (i.e. not via an alias), then originalName will be 115 // the empty string. 116 // 117 // TODO: Support aliased re-exports. (Since we don't yet support these, 118 // originalName will always be the empty string.) 119 // 120 // For stub-and-resolver nodes, the layout is 121 // 122 // uleb128 terminalSize; 123 // uleb128 flags; 124 // uleb128 stubAddress; 125 // uleb128 resolverAddress; 126 // 127 // TODO: Support stub-and-resolver nodes. 128 uint32_t TrieNode::getTerminalSize() const { 129 uint32_t size = getULEB128Size(info->flags); 130 if (info->flags & MachO::EXPORT_SYMBOL_FLAGS_REEXPORT) 131 size += getULEB128Size(info->ordinal) + 1; // + 1 for the null-terminator 132 else 133 size += getULEB128Size(info->address); 134 return size; 135 } 136 137 bool TrieNode::updateOffset(size_t &nextOffset) { 138 // Size of the whole node (including the terminalSize and the outgoing edges.) 139 // In contrast, terminalSize only records the size of the other data in the 140 // node. 141 size_t nodeSize; 142 if (info) { 143 uint32_t terminalSize = getTerminalSize(); 144 // Overall node size so far is the uleb128 size of the length of the symbol 145 // info + the symbol info itself. 146 nodeSize = terminalSize + getULEB128Size(terminalSize); 147 } else { 148 nodeSize = 1; // Size of terminalSize (which has a value of 0) 149 } 150 // Compute size of all child edges. 
151 ++nodeSize; // Byte for number of children. 152 for (const Edge &edge : edges) { 153 nodeSize += edge.substring.size() + 1 // String length. 154 + getULEB128Size(edge.child->offset); // Offset len. 155 } 156 // On input, 'nextOffset' is the new preferred location for this node. 157 bool result = (offset != nextOffset); 158 // Store new location in node object for use by parents. 159 offset = nextOffset; 160 nextOffset += nodeSize; 161 return result; 162 } 163 164 void TrieNode::writeTo(uint8_t *buf) const { 165 buf += offset; 166 if (info) { 167 uint32_t terminalSize = getTerminalSize(); 168 buf += encodeULEB128(terminalSize, buf); 169 buf += encodeULEB128(info->flags, buf); 170 if (info->flags & MachO::EXPORT_SYMBOL_FLAGS_REEXPORT) { 171 buf += encodeULEB128(info->ordinal, buf); 172 *buf++ = 0; // empty originalName string 173 } else { 174 buf += encodeULEB128(info->address, buf); 175 } 176 } else { 177 // TrieNode with no Symbol info. 178 *buf++ = 0; // terminalSize 179 } 180 // Add number of children. TODO: Handle case where we have more than 256. 181 assert(edges.size() < 256); 182 *buf++ = edges.size(); 183 // Append each child edge substring and node offset. 
184 for (const Edge &edge : edges) { 185 memcpy(buf, edge.substring.data(), edge.substring.size()); 186 buf += edge.substring.size(); 187 *buf++ = '\0'; 188 buf += encodeULEB128(edge.child->offset, buf); 189 } 190 } 191 192 TrieBuilder::~TrieBuilder() { 193 for (TrieNode *node : nodes) 194 delete node; 195 } 196 197 TrieNode *TrieBuilder::makeNode() { 198 auto *node = new TrieNode(); 199 nodes.emplace_back(node); 200 return node; 201 } 202 203 static int charAt(const Symbol *sym, size_t pos) { 204 StringRef str = sym->getName(); 205 if (pos >= str.size()) 206 return -1; 207 return str[pos]; 208 } 209 210 // Build the trie by performing a three-way radix quicksort: We start by sorting 211 // the strings by their first characters, then sort the strings with the same 212 // first characters by their second characters, and so on recursively. Each 213 // time the prefixes diverge, we add a node to the trie. 214 // 215 // node: The most recently created node along this path in the trie (i.e. 216 // the furthest from the root.) 217 // lastPos: The prefix length of the most recently created node, i.e. the number 218 // of characters along its path from the root. 219 // pos: The string index we are currently sorting on. Note that each symbol 220 // S contained in vec has the same prefix S[0...pos). 221 void TrieBuilder::sortAndBuild(MutableArrayRef<const Symbol *> vec, 222 TrieNode *node, size_t lastPos, size_t pos) { 223 tailcall: 224 if (vec.empty()) 225 return; 226 227 // Partition items so that items in [0, i) are less than the pivot, 228 // [i, j) are the same as the pivot, and [j, vec.size()) are greater than 229 // the pivot. 
230 const Symbol *pivotSymbol = vec[vec.size() / 2]; 231 int pivot = charAt(pivotSymbol, pos); 232 size_t i = 0; 233 size_t j = vec.size(); 234 for (size_t k = 0; k < j;) { 235 int c = charAt(vec[k], pos); 236 if (c < pivot) 237 std::swap(vec[i++], vec[k++]); 238 else if (c > pivot) 239 std::swap(vec[--j], vec[k]); 240 else 241 k++; 242 } 243 244 bool isTerminal = pivot == -1; 245 bool prefixesDiverge = i != 0 || j != vec.size(); 246 if (lastPos != pos && (isTerminal || prefixesDiverge)) { 247 TrieNode *newNode = makeNode(); 248 node->edges.emplace_back(pivotSymbol->getName().slice(lastPos, pos), 249 newNode); 250 node = newNode; 251 lastPos = pos; 252 } 253 254 sortAndBuild(vec.slice(0, i), node, lastPos, pos); 255 sortAndBuild(vec.slice(j), node, lastPos, pos); 256 257 if (isTerminal) { 258 assert(j - i == 1); // no duplicate symbols 259 node->info = ExportInfo(*pivotSymbol, imageBase); 260 } else { 261 // This is the tail-call-optimized version of the following: 262 // sortAndBuild(vec.slice(i, j - i), node, lastPos, pos + 1); 263 vec = vec.slice(i, j - i); 264 ++pos; 265 goto tailcall; 266 } 267 } 268 269 size_t TrieBuilder::build() { 270 if (exported.empty()) 271 return 0; 272 273 TrieNode *root = makeNode(); 274 sortAndBuild(exported, root, 0, 0); 275 276 // Assign each node in the vector an offset in the trie stream, iterating 277 // until all uleb128 sizes have stabilized. 278 size_t offset; 279 bool more; 280 do { 281 offset = 0; 282 more = false; 283 for (TrieNode *node : nodes) 284 more |= node->updateOffset(offset); 285 } while (more); 286 287 return offset; 288 } 289 290 void TrieBuilder::writeTo(uint8_t *buf) const { 291 for (TrieNode *node : nodes) 292 node->writeTo(buf); 293 } 294 295 namespace { 296 297 // Parse a serialized trie and invoke a callback for each entry. 
298 class TrieParser { 299 public: 300 TrieParser(const uint8_t *buf, size_t size, const TrieEntryCallback &callback) 301 : start(buf), end(start + size), callback(callback) {} 302 303 void parse(const uint8_t *buf, const Twine &cumulativeString); 304 305 void parse() { parse(start, ""); } 306 307 const uint8_t *start; 308 const uint8_t *end; 309 const TrieEntryCallback &callback; 310 }; 311 312 } // namespace 313 314 void TrieParser::parse(const uint8_t *buf, const Twine &cumulativeString) { 315 if (buf >= end) 316 fatal("Node offset points outside export section"); 317 318 unsigned ulebSize; 319 uint64_t terminalSize = decodeULEB128(buf, &ulebSize); 320 buf += ulebSize; 321 uint64_t flags = 0; 322 size_t offset; 323 if (terminalSize != 0) { 324 flags = decodeULEB128(buf, &ulebSize); 325 callback(cumulativeString, flags); 326 } 327 buf += terminalSize; 328 uint8_t numEdges = *buf++; 329 for (uint8_t i = 0; i < numEdges; ++i) { 330 const char *cbuf = reinterpret_cast<const char *>(buf); 331 StringRef substring = StringRef(cbuf, strnlen(cbuf, end - buf)); 332 buf += substring.size() + 1; 333 offset = decodeULEB128(buf, &ulebSize); 334 buf += ulebSize; 335 parse(start + offset, cumulativeString + substring); 336 } 337 } 338 339 void macho::parseTrie(const uint8_t *buf, size_t size, 340 const TrieEntryCallback &callback) { 341 if (size == 0) 342 return; 343 344 TrieParser(buf, size, callback).parse(); 345 } 346