1*5ffd83dbSDimitry Andric //===- ExportTrie.cpp -----------------------------------------------------===// 2*5ffd83dbSDimitry Andric // 3*5ffd83dbSDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4*5ffd83dbSDimitry Andric // See https://llvm.org/LICENSE.txt for license information. 5*5ffd83dbSDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6*5ffd83dbSDimitry Andric // 7*5ffd83dbSDimitry Andric //===----------------------------------------------------------------------===// 8*5ffd83dbSDimitry Andric // 9*5ffd83dbSDimitry Andric // This is a partial implementation of the Mach-O export trie format. It's 10*5ffd83dbSDimitry Andric // essentially a symbol table encoded as a compressed prefix trie, meaning that 11*5ffd83dbSDimitry Andric // the common prefixes of each symbol name are shared for a more compact 12*5ffd83dbSDimitry Andric // representation. The prefixes are stored on the edges of the trie, and one 13*5ffd83dbSDimitry Andric // edge can represent multiple characters. For example, given two exported 14*5ffd83dbSDimitry Andric // symbols _bar and _baz, we will have a trie like this (terminal nodes are 15*5ffd83dbSDimitry Andric // marked with an asterisk): 16*5ffd83dbSDimitry Andric // 17*5ffd83dbSDimitry Andric // +-+-+ 18*5ffd83dbSDimitry Andric // | | // root node 19*5ffd83dbSDimitry Andric // +-+-+ 20*5ffd83dbSDimitry Andric // | 21*5ffd83dbSDimitry Andric // | _ba 22*5ffd83dbSDimitry Andric // | 23*5ffd83dbSDimitry Andric // +-+-+ 24*5ffd83dbSDimitry Andric // | | 25*5ffd83dbSDimitry Andric // +-+-+ 26*5ffd83dbSDimitry Andric // r / \ z 27*5ffd83dbSDimitry Andric // / \ 28*5ffd83dbSDimitry Andric // +-+-+ +-+-+ 29*5ffd83dbSDimitry Andric // | * | | * | 30*5ffd83dbSDimitry Andric // +-+-+ +-+-+ 31*5ffd83dbSDimitry Andric // 32*5ffd83dbSDimitry Andric // More documentation of the format can be found in 33*5ffd83dbSDimitry Andric // llvm/tools/obj2yaml/macho2yaml.cpp. 34*5ffd83dbSDimitry Andric // 35*5ffd83dbSDimitry Andric //===----------------------------------------------------------------------===// 36*5ffd83dbSDimitry Andric 37*5ffd83dbSDimitry Andric #include "ExportTrie.h" 38*5ffd83dbSDimitry Andric #include "Symbols.h" 39*5ffd83dbSDimitry Andric 40*5ffd83dbSDimitry Andric #include "lld/Common/ErrorHandler.h" 41*5ffd83dbSDimitry Andric #include "lld/Common/Memory.h" 42*5ffd83dbSDimitry Andric #include "llvm/ADT/Optional.h" 43*5ffd83dbSDimitry Andric #include "llvm/BinaryFormat/MachO.h" 44*5ffd83dbSDimitry Andric #include "llvm/Support/LEB128.h" 45*5ffd83dbSDimitry Andric 46*5ffd83dbSDimitry Andric using namespace llvm; 47*5ffd83dbSDimitry Andric using namespace llvm::MachO; 48*5ffd83dbSDimitry Andric using namespace lld; 49*5ffd83dbSDimitry Andric using namespace lld::macho; 50*5ffd83dbSDimitry Andric 51*5ffd83dbSDimitry Andric namespace { 52*5ffd83dbSDimitry Andric 53*5ffd83dbSDimitry Andric struct Edge { 54*5ffd83dbSDimitry Andric Edge(StringRef s, TrieNode *node) : substring(s), child(node) {} 55*5ffd83dbSDimitry Andric 56*5ffd83dbSDimitry Andric StringRef substring; 57*5ffd83dbSDimitry Andric struct TrieNode *child; 58*5ffd83dbSDimitry Andric }; 59*5ffd83dbSDimitry Andric 60*5ffd83dbSDimitry Andric struct ExportInfo { 61*5ffd83dbSDimitry Andric uint64_t address; 62*5ffd83dbSDimitry Andric // TODO: Add proper support for re-exports & stub-and-resolver flags. 63*5ffd83dbSDimitry Andric }; 64*5ffd83dbSDimitry Andric 65*5ffd83dbSDimitry Andric } // namespace 66*5ffd83dbSDimitry Andric 67*5ffd83dbSDimitry Andric struct macho::TrieNode { 68*5ffd83dbSDimitry Andric std::vector<Edge> edges; 69*5ffd83dbSDimitry Andric Optional<ExportInfo> info; 70*5ffd83dbSDimitry Andric // Estimated offset from the start of the serialized trie to the current node. 71*5ffd83dbSDimitry Andric // This will converge to the true offset when updateOffset() is run to a 72*5ffd83dbSDimitry Andric // fixpoint. 73*5ffd83dbSDimitry Andric size_t offset = 0; 74*5ffd83dbSDimitry Andric 75*5ffd83dbSDimitry Andric // Returns whether the new estimated offset differs from the old one. 76*5ffd83dbSDimitry Andric bool updateOffset(size_t &nextOffset); 77*5ffd83dbSDimitry Andric void writeTo(uint8_t *buf) const; 78*5ffd83dbSDimitry Andric }; 79*5ffd83dbSDimitry Andric 80*5ffd83dbSDimitry Andric bool TrieNode::updateOffset(size_t &nextOffset) { 81*5ffd83dbSDimitry Andric // Size of the whole node (including the terminalSize and the outgoing edges.) 82*5ffd83dbSDimitry Andric // In contrast, terminalSize only records the size of the other data in the 83*5ffd83dbSDimitry Andric // node. 84*5ffd83dbSDimitry Andric size_t nodeSize; 85*5ffd83dbSDimitry Andric if (info) { 86*5ffd83dbSDimitry Andric uint64_t flags = 0; 87*5ffd83dbSDimitry Andric uint32_t terminalSize = 88*5ffd83dbSDimitry Andric getULEB128Size(flags) + getULEB128Size(info->address); 89*5ffd83dbSDimitry Andric // Overall node size so far is the uleb128 size of the length of the symbol 90*5ffd83dbSDimitry Andric // info + the symbol info itself. 91*5ffd83dbSDimitry Andric nodeSize = terminalSize + getULEB128Size(terminalSize); 92*5ffd83dbSDimitry Andric } else { 93*5ffd83dbSDimitry Andric nodeSize = 1; // Size of terminalSize (which has a value of 0) 94*5ffd83dbSDimitry Andric } 95*5ffd83dbSDimitry Andric // Compute size of all child edges. 96*5ffd83dbSDimitry Andric ++nodeSize; // Byte for number of children. 97*5ffd83dbSDimitry Andric for (Edge &edge : edges) { 98*5ffd83dbSDimitry Andric nodeSize += edge.substring.size() + 1 // String length. 99*5ffd83dbSDimitry Andric + getULEB128Size(edge.child->offset); // Offset len. 100*5ffd83dbSDimitry Andric } 101*5ffd83dbSDimitry Andric // On input, 'nextOffset' is the new preferred location for this node. 102*5ffd83dbSDimitry Andric bool result = (offset != nextOffset); 103*5ffd83dbSDimitry Andric // Store new location in node object for use by parents. 104*5ffd83dbSDimitry Andric offset = nextOffset; 105*5ffd83dbSDimitry Andric nextOffset += nodeSize; 106*5ffd83dbSDimitry Andric return result; 107*5ffd83dbSDimitry Andric } 108*5ffd83dbSDimitry Andric 109*5ffd83dbSDimitry Andric void TrieNode::writeTo(uint8_t *buf) const { 110*5ffd83dbSDimitry Andric buf += offset; 111*5ffd83dbSDimitry Andric if (info) { 112*5ffd83dbSDimitry Andric // TrieNodes with Symbol info: size, flags address 113*5ffd83dbSDimitry Andric uint64_t flags = 0; // TODO: emit proper flags 114*5ffd83dbSDimitry Andric uint32_t terminalSize = 115*5ffd83dbSDimitry Andric getULEB128Size(flags) + getULEB128Size(info->address); 116*5ffd83dbSDimitry Andric buf += encodeULEB128(terminalSize, buf); 117*5ffd83dbSDimitry Andric buf += encodeULEB128(flags, buf); 118*5ffd83dbSDimitry Andric buf += encodeULEB128(info->address, buf); 119*5ffd83dbSDimitry Andric } else { 120*5ffd83dbSDimitry Andric // TrieNode with no Symbol info. 121*5ffd83dbSDimitry Andric *buf++ = 0; // terminalSize 122*5ffd83dbSDimitry Andric } 123*5ffd83dbSDimitry Andric // Add number of children. TODO: Handle case where we have more than 256. 124*5ffd83dbSDimitry Andric assert(edges.size() < 256); 125*5ffd83dbSDimitry Andric *buf++ = edges.size(); 126*5ffd83dbSDimitry Andric // Append each child edge substring and node offset. 127*5ffd83dbSDimitry Andric for (const Edge &edge : edges) { 128*5ffd83dbSDimitry Andric memcpy(buf, edge.substring.data(), edge.substring.size()); 129*5ffd83dbSDimitry Andric buf += edge.substring.size(); 130*5ffd83dbSDimitry Andric *buf++ = '\0'; 131*5ffd83dbSDimitry Andric buf += encodeULEB128(edge.child->offset, buf); 132*5ffd83dbSDimitry Andric } 133*5ffd83dbSDimitry Andric } 134*5ffd83dbSDimitry Andric 135*5ffd83dbSDimitry Andric TrieNode *TrieBuilder::makeNode() { 136*5ffd83dbSDimitry Andric auto *node = make<TrieNode>(); 137*5ffd83dbSDimitry Andric nodes.emplace_back(node); 138*5ffd83dbSDimitry Andric return node; 139*5ffd83dbSDimitry Andric } 140*5ffd83dbSDimitry Andric 141*5ffd83dbSDimitry Andric static int charAt(const Symbol *sym, size_t pos) { 142*5ffd83dbSDimitry Andric StringRef str = sym->getName(); 143*5ffd83dbSDimitry Andric if (pos >= str.size()) 144*5ffd83dbSDimitry Andric return -1; 145*5ffd83dbSDimitry Andric return str[pos]; 146*5ffd83dbSDimitry Andric } 147*5ffd83dbSDimitry Andric 148*5ffd83dbSDimitry Andric // Build the trie by performing a three-way radix quicksort: We start by sorting 149*5ffd83dbSDimitry Andric // the strings by their first characters, then sort the strings with the same 150*5ffd83dbSDimitry Andric // first characters by their second characters, and so on recursively. Each 151*5ffd83dbSDimitry Andric // time the prefixes diverge, we add a node to the trie. 152*5ffd83dbSDimitry Andric // 153*5ffd83dbSDimitry Andric // node: The most recently created node along this path in the trie (i.e. 154*5ffd83dbSDimitry Andric // the furthest from the root.) 155*5ffd83dbSDimitry Andric // lastPos: The prefix length of the most recently created node, i.e. the number 156*5ffd83dbSDimitry Andric // of characters along its path from the root. 157*5ffd83dbSDimitry Andric // pos: The string index we are currently sorting on. Note that each symbol 158*5ffd83dbSDimitry Andric // S contained in vec has the same prefix S[0...pos). 159*5ffd83dbSDimitry Andric void TrieBuilder::sortAndBuild(MutableArrayRef<const Symbol *> vec, 160*5ffd83dbSDimitry Andric TrieNode *node, size_t lastPos, size_t pos) { 161*5ffd83dbSDimitry Andric tailcall: 162*5ffd83dbSDimitry Andric if (vec.empty()) 163*5ffd83dbSDimitry Andric return; 164*5ffd83dbSDimitry Andric 165*5ffd83dbSDimitry Andric // Partition items so that items in [0, i) are less than the pivot, 166*5ffd83dbSDimitry Andric // [i, j) are the same as the pivot, and [j, vec.size()) are greater than 167*5ffd83dbSDimitry Andric // the pivot. 168*5ffd83dbSDimitry Andric const Symbol *pivotSymbol = vec[vec.size() / 2]; 169*5ffd83dbSDimitry Andric int pivot = charAt(pivotSymbol, pos); 170*5ffd83dbSDimitry Andric size_t i = 0; 171*5ffd83dbSDimitry Andric size_t j = vec.size(); 172*5ffd83dbSDimitry Andric for (size_t k = 0; k < j;) { 173*5ffd83dbSDimitry Andric int c = charAt(vec[k], pos); 174*5ffd83dbSDimitry Andric if (c < pivot) 175*5ffd83dbSDimitry Andric std::swap(vec[i++], vec[k++]); 176*5ffd83dbSDimitry Andric else if (c > pivot) 177*5ffd83dbSDimitry Andric std::swap(vec[--j], vec[k]); 178*5ffd83dbSDimitry Andric else 179*5ffd83dbSDimitry Andric k++; 180*5ffd83dbSDimitry Andric } 181*5ffd83dbSDimitry Andric 182*5ffd83dbSDimitry Andric bool isTerminal = pivot == -1; 183*5ffd83dbSDimitry Andric bool prefixesDiverge = i != 0 || j != vec.size(); 184*5ffd83dbSDimitry Andric if (lastPos != pos && (isTerminal || prefixesDiverge)) { 185*5ffd83dbSDimitry Andric TrieNode *newNode = makeNode(); 186*5ffd83dbSDimitry Andric node->edges.emplace_back(pivotSymbol->getName().slice(lastPos, pos), 187*5ffd83dbSDimitry Andric newNode); 188*5ffd83dbSDimitry Andric node = newNode; 189*5ffd83dbSDimitry Andric lastPos = pos; 190*5ffd83dbSDimitry Andric } 191*5ffd83dbSDimitry Andric 192*5ffd83dbSDimitry Andric sortAndBuild(vec.slice(0, i), node, lastPos, pos); 193*5ffd83dbSDimitry Andric sortAndBuild(vec.slice(j), node, lastPos, pos); 194*5ffd83dbSDimitry Andric 195*5ffd83dbSDimitry Andric if (isTerminal) { 196*5ffd83dbSDimitry Andric assert(j - i == 1); // no duplicate symbols 197*5ffd83dbSDimitry Andric node->info = {pivotSymbol->getVA()}; 198*5ffd83dbSDimitry Andric } else { 199*5ffd83dbSDimitry Andric // This is the tail-call-optimized version of the following: 200*5ffd83dbSDimitry Andric // sortAndBuild(vec.slice(i, j - i), node, lastPos, pos + 1); 201*5ffd83dbSDimitry Andric vec = vec.slice(i, j - i); 202*5ffd83dbSDimitry Andric ++pos; 203*5ffd83dbSDimitry Andric goto tailcall; 204*5ffd83dbSDimitry Andric } 205*5ffd83dbSDimitry Andric } 206*5ffd83dbSDimitry Andric 207*5ffd83dbSDimitry Andric size_t TrieBuilder::build() { 208*5ffd83dbSDimitry Andric if (exported.empty()) 209*5ffd83dbSDimitry Andric return 0; 210*5ffd83dbSDimitry Andric 211*5ffd83dbSDimitry Andric TrieNode *root = makeNode(); 212*5ffd83dbSDimitry Andric sortAndBuild(exported, root, 0, 0); 213*5ffd83dbSDimitry Andric 214*5ffd83dbSDimitry Andric // Assign each node in the vector an offset in the trie stream, iterating 215*5ffd83dbSDimitry Andric // until all uleb128 sizes have stabilized. 216*5ffd83dbSDimitry Andric size_t offset; 217*5ffd83dbSDimitry Andric bool more; 218*5ffd83dbSDimitry Andric do { 219*5ffd83dbSDimitry Andric offset = 0; 220*5ffd83dbSDimitry Andric more = false; 221*5ffd83dbSDimitry Andric for (TrieNode *node : nodes) 222*5ffd83dbSDimitry Andric more |= node->updateOffset(offset); 223*5ffd83dbSDimitry Andric } while (more); 224*5ffd83dbSDimitry Andric 225*5ffd83dbSDimitry Andric return offset; 226*5ffd83dbSDimitry Andric } 227*5ffd83dbSDimitry Andric 228*5ffd83dbSDimitry Andric void TrieBuilder::writeTo(uint8_t *buf) const { 229*5ffd83dbSDimitry Andric for (TrieNode *node : nodes) 230*5ffd83dbSDimitry Andric node->writeTo(buf); 231*5ffd83dbSDimitry Andric } 232*5ffd83dbSDimitry Andric 233*5ffd83dbSDimitry Andric namespace { 234*5ffd83dbSDimitry Andric 235*5ffd83dbSDimitry Andric // Parse a serialized trie and invoke a callback for each entry. 236*5ffd83dbSDimitry Andric class TrieParser { 237*5ffd83dbSDimitry Andric public: 238*5ffd83dbSDimitry Andric TrieParser(const uint8_t *buf, size_t size, const TrieEntryCallback &callback) 239*5ffd83dbSDimitry Andric : start(buf), end(start + size), callback(callback) {} 240*5ffd83dbSDimitry Andric 241*5ffd83dbSDimitry Andric void parse(const uint8_t *buf, const Twine &cumulativeString); 242*5ffd83dbSDimitry Andric 243*5ffd83dbSDimitry Andric void parse() { parse(start, ""); } 244*5ffd83dbSDimitry Andric 245*5ffd83dbSDimitry Andric const uint8_t *start; 246*5ffd83dbSDimitry Andric const uint8_t *end; 247*5ffd83dbSDimitry Andric const TrieEntryCallback &callback; 248*5ffd83dbSDimitry Andric }; 249*5ffd83dbSDimitry Andric 250*5ffd83dbSDimitry Andric } // namespace 251*5ffd83dbSDimitry Andric 252*5ffd83dbSDimitry Andric void TrieParser::parse(const uint8_t *buf, const Twine &cumulativeString) { 253*5ffd83dbSDimitry Andric if (buf >= end) 254*5ffd83dbSDimitry Andric fatal("Node offset points outside export section"); 255*5ffd83dbSDimitry Andric 256*5ffd83dbSDimitry Andric unsigned ulebSize; 257*5ffd83dbSDimitry Andric uint64_t terminalSize = decodeULEB128(buf, &ulebSize); 258*5ffd83dbSDimitry Andric buf += ulebSize; 259*5ffd83dbSDimitry Andric uint64_t flags = 0; 260*5ffd83dbSDimitry Andric size_t offset; 261*5ffd83dbSDimitry Andric if (terminalSize != 0) { 262*5ffd83dbSDimitry Andric flags = decodeULEB128(buf, &ulebSize); 263*5ffd83dbSDimitry Andric callback(cumulativeString, flags); 264*5ffd83dbSDimitry Andric } 265*5ffd83dbSDimitry Andric buf += terminalSize; 266*5ffd83dbSDimitry Andric uint8_t numEdges = *buf++; 267*5ffd83dbSDimitry Andric for (uint8_t i = 0; i < numEdges; ++i) { 268*5ffd83dbSDimitry Andric const char *cbuf = reinterpret_cast<const char *>(buf); 269*5ffd83dbSDimitry Andric StringRef substring = StringRef(cbuf, strnlen(cbuf, end - buf)); 270*5ffd83dbSDimitry Andric buf += substring.size() + 1; 271*5ffd83dbSDimitry Andric offset = decodeULEB128(buf, &ulebSize); 272*5ffd83dbSDimitry Andric buf += ulebSize; 273*5ffd83dbSDimitry Andric parse(start + offset, cumulativeString + substring); 274*5ffd83dbSDimitry Andric } 275*5ffd83dbSDimitry Andric } 276*5ffd83dbSDimitry Andric 277*5ffd83dbSDimitry Andric void macho::parseTrie(const uint8_t *buf, size_t size, 278*5ffd83dbSDimitry Andric const TrieEntryCallback &callback) { 279*5ffd83dbSDimitry Andric if (size == 0) 280*5ffd83dbSDimitry Andric return; 281*5ffd83dbSDimitry Andric 282*5ffd83dbSDimitry Andric TrieParser(buf, size, callback).parse(); 283*5ffd83dbSDimitry Andric } 284