1 //===- ExportTrie.cpp -----------------------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This is a partial implementation of the Mach-O export trie format. It's 10 // essentially a symbol table encoded as a compressed prefix trie, meaning that 11 // the common prefixes of each symbol name are shared for a more compact 12 // representation. The prefixes are stored on the edges of the trie, and one 13 // edge can represent multiple characters. For example, given two exported 14 // symbols _bar and _baz, we will have a trie like this (terminal nodes are 15 // marked with an asterisk): 16 // 17 // +-+-+ 18 // | | // root node 19 // +-+-+ 20 // | 21 // | _ba 22 // | 23 // +-+-+ 24 // | | 25 // +-+-+ 26 // r / \ z 27 // / \ 28 // +-+-+ +-+-+ 29 // | * | | * | 30 // +-+-+ +-+-+ 31 // 32 // More documentation of the format can be found in 33 // llvm/tools/obj2yaml/macho2yaml.cpp. 34 // 35 //===----------------------------------------------------------------------===// 36 37 #include "ExportTrie.h" 38 #include "Symbols.h" 39 40 #include "lld/Common/ErrorHandler.h" 41 #include "lld/Common/Memory.h" 42 #include "llvm/BinaryFormat/MachO.h" 43 #include "llvm/Support/LEB128.h" 44 #include <optional> 45 46 using namespace llvm; 47 using namespace lld; 48 using namespace lld::macho; 49 50 namespace { 51 52 struct Edge { 53 Edge(StringRef s, TrieNode *node) : substring(s), child(node) {} 54 55 StringRef substring; 56 struct TrieNode *child; 57 }; 58 59 struct ExportInfo { 60 uint64_t address; 61 uint8_t flags = 0; 62 ExportInfo(const Symbol &sym, uint64_t imageBase) 63 : address(sym.getVA() - imageBase) { 64 using namespace llvm::MachO; 65 // Set the symbol type. 66 if (sym.isWeakDef()) 67 flags |= EXPORT_SYMBOL_FLAGS_WEAK_DEFINITION; 68 // TODO: Add proper support for re-exports & stub-and-resolver flags. 69 70 // Set the symbol kind. 71 if (sym.isTlv()) { 72 flags |= EXPORT_SYMBOL_FLAGS_KIND_THREAD_LOCAL; 73 } else if (auto *defined = dyn_cast<Defined>(&sym)) { 74 if (defined->isAbsolute()) 75 flags |= EXPORT_SYMBOL_FLAGS_KIND_ABSOLUTE; 76 } 77 } 78 }; 79 80 } // namespace 81 82 struct macho::TrieNode { 83 std::vector<Edge> edges; 84 std::optional<ExportInfo> info; 85 // Estimated offset from the start of the serialized trie to the current node. 86 // This will converge to the true offset when updateOffset() is run to a 87 // fixpoint. 88 size_t offset = 0; 89 90 // Returns whether the new estimated offset differs from the old one. 91 bool updateOffset(size_t &nextOffset); 92 void writeTo(uint8_t *buf) const; 93 }; 94 95 bool TrieNode::updateOffset(size_t &nextOffset) { 96 // Size of the whole node (including the terminalSize and the outgoing edges.) 97 // In contrast, terminalSize only records the size of the other data in the 98 // node. 99 size_t nodeSize; 100 if (info) { 101 uint32_t terminalSize = 102 getULEB128Size(info->flags) + getULEB128Size(info->address); 103 // Overall node size so far is the uleb128 size of the length of the symbol 104 // info + the symbol info itself. 105 nodeSize = terminalSize + getULEB128Size(terminalSize); 106 } else { 107 nodeSize = 1; // Size of terminalSize (which has a value of 0) 108 } 109 // Compute size of all child edges. 110 ++nodeSize; // Byte for number of children. 111 for (const Edge &edge : edges) { 112 nodeSize += edge.substring.size() + 1 // String length. 113 + getULEB128Size(edge.child->offset); // Offset len. 114 } 115 // On input, 'nextOffset' is the new preferred location for this node. 116 bool result = (offset != nextOffset); 117 // Store new location in node object for use by parents. 118 offset = nextOffset; 119 nextOffset += nodeSize; 120 return result; 121 } 122 123 void TrieNode::writeTo(uint8_t *buf) const { 124 buf += offset; 125 if (info) { 126 // TrieNodes with Symbol info: size, flags address 127 uint32_t terminalSize = 128 getULEB128Size(info->flags) + getULEB128Size(info->address); 129 buf += encodeULEB128(terminalSize, buf); 130 buf += encodeULEB128(info->flags, buf); 131 buf += encodeULEB128(info->address, buf); 132 } else { 133 // TrieNode with no Symbol info. 134 *buf++ = 0; // terminalSize 135 } 136 // Add number of children. TODO: Handle case where we have more than 256. 137 assert(edges.size() < 256); 138 *buf++ = edges.size(); 139 // Append each child edge substring and node offset. 140 for (const Edge &edge : edges) { 141 memcpy(buf, edge.substring.data(), edge.substring.size()); 142 buf += edge.substring.size(); 143 *buf++ = '\0'; 144 buf += encodeULEB128(edge.child->offset, buf); 145 } 146 } 147 148 TrieBuilder::~TrieBuilder() { 149 for (TrieNode *node : nodes) 150 delete node; 151 } 152 153 TrieNode *TrieBuilder::makeNode() { 154 auto *node = new TrieNode(); 155 nodes.emplace_back(node); 156 return node; 157 } 158 159 static int charAt(const Symbol *sym, size_t pos) { 160 StringRef str = sym->getName(); 161 if (pos >= str.size()) 162 return -1; 163 return str[pos]; 164 } 165 166 // Build the trie by performing a three-way radix quicksort: We start by sorting 167 // the strings by their first characters, then sort the strings with the same 168 // first characters by their second characters, and so on recursively. Each 169 // time the prefixes diverge, we add a node to the trie. 170 // 171 // node: The most recently created node along this path in the trie (i.e. 172 // the furthest from the root.) 173 // lastPos: The prefix length of the most recently created node, i.e. the number 174 // of characters along its path from the root. 175 // pos: The string index we are currently sorting on. Note that each symbol 176 // S contained in vec has the same prefix S[0...pos). 177 void TrieBuilder::sortAndBuild(MutableArrayRef<const Symbol *> vec, 178 TrieNode *node, size_t lastPos, size_t pos) { 179 tailcall: 180 if (vec.empty()) 181 return; 182 183 // Partition items so that items in [0, i) are less than the pivot, 184 // [i, j) are the same as the pivot, and [j, vec.size()) are greater than 185 // the pivot. 186 const Symbol *pivotSymbol = vec[vec.size() / 2]; 187 int pivot = charAt(pivotSymbol, pos); 188 size_t i = 0; 189 size_t j = vec.size(); 190 for (size_t k = 0; k < j;) { 191 int c = charAt(vec[k], pos); 192 if (c < pivot) 193 std::swap(vec[i++], vec[k++]); 194 else if (c > pivot) 195 std::swap(vec[--j], vec[k]); 196 else 197 k++; 198 } 199 200 bool isTerminal = pivot == -1; 201 bool prefixesDiverge = i != 0 || j != vec.size(); 202 if (lastPos != pos && (isTerminal || prefixesDiverge)) { 203 TrieNode *newNode = makeNode(); 204 node->edges.emplace_back(pivotSymbol->getName().slice(lastPos, pos), 205 newNode); 206 node = newNode; 207 lastPos = pos; 208 } 209 210 sortAndBuild(vec.slice(0, i), node, lastPos, pos); 211 sortAndBuild(vec.slice(j), node, lastPos, pos); 212 213 if (isTerminal) { 214 assert(j - i == 1); // no duplicate symbols 215 node->info = ExportInfo(*pivotSymbol, imageBase); 216 } else { 217 // This is the tail-call-optimized version of the following: 218 // sortAndBuild(vec.slice(i, j - i), node, lastPos, pos + 1); 219 vec = vec.slice(i, j - i); 220 ++pos; 221 goto tailcall; 222 } 223 } 224 225 size_t TrieBuilder::build() { 226 if (exported.empty()) 227 return 0; 228 229 TrieNode *root = makeNode(); 230 sortAndBuild(exported, root, 0, 0); 231 232 // Assign each node in the vector an offset in the trie stream, iterating 233 // until all uleb128 sizes have stabilized. 234 size_t offset; 235 bool more; 236 do { 237 offset = 0; 238 more = false; 239 for (TrieNode *node : nodes) 240 more |= node->updateOffset(offset); 241 } while (more); 242 243 return offset; 244 } 245 246 void TrieBuilder::writeTo(uint8_t *buf) const { 247 for (TrieNode *node : nodes) 248 node->writeTo(buf); 249 } 250 251 namespace { 252 253 // Parse a serialized trie and invoke a callback for each entry. 254 class TrieParser { 255 public: 256 TrieParser(const uint8_t *buf, size_t size, const TrieEntryCallback &callback) 257 : start(buf), end(start + size), callback(callback) {} 258 259 void parse(const uint8_t *buf, const Twine &cumulativeString); 260 261 void parse() { parse(start, ""); } 262 263 const uint8_t *start; 264 const uint8_t *end; 265 const TrieEntryCallback &callback; 266 }; 267 268 } // namespace 269 270 void TrieParser::parse(const uint8_t *buf, const Twine &cumulativeString) { 271 if (buf >= end) 272 fatal("Node offset points outside export section"); 273 274 unsigned ulebSize; 275 uint64_t terminalSize = decodeULEB128(buf, &ulebSize); 276 buf += ulebSize; 277 uint64_t flags = 0; 278 size_t offset; 279 if (terminalSize != 0) { 280 flags = decodeULEB128(buf, &ulebSize); 281 callback(cumulativeString, flags); 282 } 283 buf += terminalSize; 284 uint8_t numEdges = *buf++; 285 for (uint8_t i = 0; i < numEdges; ++i) { 286 const char *cbuf = reinterpret_cast<const char *>(buf); 287 StringRef substring = StringRef(cbuf, strnlen(cbuf, end - buf)); 288 buf += substring.size() + 1; 289 offset = decodeULEB128(buf, &ulebSize); 290 buf += ulebSize; 291 parse(start + offset, cumulativeString + substring); 292 } 293 } 294 295 void macho::parseTrie(const uint8_t *buf, size_t size, 296 const TrieEntryCallback &callback) { 297 if (size == 0) 298 return; 299 300 TrieParser(buf, size, callback).parse(); 301 } 302