1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms of version 5 * 1.0 of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 12 /* 13 * Copyright 2019 Joyent, Inc. 14 * Copyright 2021 Jason King 15 */ 16 17 #include <errno.h> 18 #include <libcustr.h> 19 #include <limits.h> 20 #include <string.h> 21 #include <stdio.h> 22 23 #include "rust.h" 24 25 /* 26 * Unfortunately, there is currently no official specification for the legacy 27 * rust name mangling. This is an attempt to document the understanding of the 28 * mangling used here. It is based off examination of 29 * https://docs.rs/rustc-demangle/0.1.13/rustc_demangle/ 30 * 31 * A mangled rust name is: 32 * <prefix> <name> 33 * 34 * <prefix> ::= _Z 35 * __Z 36 * 37 * <name> ::= N <name-segment>+ [<hash>] E 38 * 39 * <name-segment> ::= <len> <name-chars>{len} 40 * 41 * <len> ::= [1-9][0-9]+ 42 * 43 * <name-chars> ::= <[A-Za-z]> <[A-Za-z0-9]>* 44 * <separator> 45 * <special> 46 * 47 * <separator> ::= '..' # '::' 48 * 49 * <special> ::= $SP$ # '@' 50 * $BP$ # '*' 51 * $RF$ # '&' 52 * $LT$ # '<' 53 * $GT$ # '>' 54 * $LP$ # '(' 55 * $RP$ # ')' 56 * $C$ # ',' 57 * 58 * <hash> := <len> h <hex-digits>+ 59 * 60 * <hex-digits> := <[0-9a-f]> 61 */ 62 63 static const struct rust_charmap { 64 const char *ruc_seq; 65 char ruc_ch; 66 } rust_charmap[] = { 67 { "$SP$", '@' }, 68 { "$BP$", '*' }, 69 { "$RF$", '&' }, 70 { "$LT$", '<' }, 71 { "$GT$", '>' }, 72 { "$LP$", '(' }, 73 { "$RP$", ')' }, 74 { "$C$", ',' }, 75 }; 76 static const size_t rust_charmap_sz = ARRAY_SIZE(rust_charmap); 77 78 static boolean_t rustleg_valid_sym(const strview_t *); 79 static boolean_t rustleg_parse_name(rust_state_t *, strview_t *); 80 static boolean_t rustleg_parse_hash(rust_state_t *, strview_t *); 81 static boolean_t rustleg_parse_special(rust_state_t *, strview_t *); 82 static boolean_t rustleg_add_sep(rust_state_t *); 83 84 boolean_t 85 rust_demangle_legacy(rust_state_t *restrict st, strview_t *restrict sv) 86 { 87 88 /* Make sure the whole thing contains valid characters */ 89 if (!rustleg_valid_sym(sv)) { 90 st->rs_error = EINVAL; 91 return (B_FALSE); 92 } 93 94 if (sv_peek(sv, -1) != 'E') { 95 DEMDEBUG("ERROR: string does not end with 'E'"); 96 st->rs_error = EINVAL; 97 return (B_FALSE); 98 } 99 100 if (!rustleg_parse_name(st, sv)) 101 return (B_FALSE); 102 103 if (sv_remaining(sv) != 0) { 104 DEMDEBUG("ERROR: trailing characters in name"); 105 st->rs_error = EINVAL; 106 return (B_FALSE); 107 } 108 109 return (B_TRUE); 110 } 111 112 static boolean_t 113 rustleg_parse_name_segment(rust_state_t *st, strview_t *svp, boolean_t first) 114 { 115 strview_t orig; 116 strview_t name; 117 uint64_t len; 118 size_t rem; 119 boolean_t last = B_FALSE; 120 121 if (HAS_ERROR(st) || sv_remaining(svp) == 0) 122 return (B_FALSE); 123 124 sv_init_sv(&orig, svp); 125 126 if (!rust_parse_base10(st, svp, &len)) { 127 DEMDEBUG("ERROR: no leading length"); 128 st->rs_error = EINVAL; 129 return (B_FALSE); 130 } 131 132 rem = sv_remaining(svp); 133 134 if (rem < len) { 135 DEMDEBUG("ERROR: segment length (%" PRIu64 ") > remaining " 136 "bytes in string (%zu)", len, rem); 137 st->rs_error = EINVAL; 138 return (B_FALSE); 139 } 140 141 /* Is this the last segment before the terminating E? */ 142 if (rem == len + 1) { 143 VERIFY3U(sv_peek(svp, -1), ==, 'E'); 144 last = B_TRUE; 145 } 146 147 if (!first && !rustleg_add_sep(st)) 148 return (B_FALSE); 149 150 /* Reduce length of seg to the length we parsed */ 151 (void) sv_init_sv_range(&name, svp, len); 152 153 DEMDEBUG("%s: segment='%.*s'", __func__, SV_PRINT(&name)); 154 155 /* 156 * A rust hash starts with 'h', and is the last component of a name 157 * before the terminating 'E'. It is however not always present 158 * in every mangled symbol, and a last segment that starts with 'h' 159 * could be confused for it, so failing to part it just means 160 * we don't have a trailing hash. 161 */ 162 if (sv_peek(&name, 0) == 'h' && last) { 163 if (rustleg_parse_hash(st, &name)) 164 goto done; 165 166 /* 167 * However any error other than 'not a hash' (e.g. ENOMEM) 168 * means we should fail. 169 */ 170 if (st->rs_error != 0) 171 goto done; 172 } 173 174 /* A '_' followed by $ is ignored at the start of a name segment */ 175 if (sv_peek(&name, 0) == '_' && sv_peek(&name, 1) == '$') 176 (void) sv_consume_n(&name, 1); 177 178 while (sv_remaining(&name) > 0) { 179 switch (sv_peek(&name, 0)) { 180 case '$': 181 if (rustleg_parse_special(st, &name)) 182 continue; 183 break; 184 case '.': 185 /* Convert '..' to '::' */ 186 if (sv_peek(&name, 1) != '.') 187 break; 188 189 if (!rustleg_add_sep(st)) 190 return (B_FALSE); 191 192 sv_consume_n(&name, 2); 193 continue; 194 default: 195 break; 196 } 197 198 if (!rust_appendc(st, sv_consume_c(&name))) { 199 SET_ERROR(st); 200 return (B_FALSE); 201 } 202 } 203 204 done: 205 sv_consume_n(svp, len); 206 207 VERIFY3P(orig.sv_first, <=, svp->sv_first); 208 DEMDEBUG("%s: consumed '%.*s'", __func__, 209 (int)(uintptr_t)(svp->sv_first - orig.sv_first), orig.sv_first); 210 return (B_TRUE); 211 } 212 213 /* 214 * Parse N (<num><name>{num})+ [<num>h<hex digits]E 215 */ 216 static boolean_t 217 rustleg_parse_name(rust_state_t *st, strview_t *svp) 218 { 219 strview_t name; 220 boolean_t first = B_TRUE; 221 222 sv_init_sv(&name, svp); 223 224 if (HAS_ERROR(st)) 225 return (B_FALSE); 226 227 DEMDEBUG("%s: name = '%.*s'", __func__, SV_PRINT(&name)); 228 229 if (sv_remaining(svp) == 0) { 230 DEMDEBUG("%s: empty name", __func__); 231 return (B_FALSE); 232 } 233 234 if (!sv_consume_if_c(svp, 'N')) { 235 DEMDEBUG("%s: does not start with 'N'", __func__); 236 return (B_FALSE); 237 } 238 239 while (sv_remaining(svp) > 0 && sv_peek(svp, 0) != 'E') { 240 if (!rustleg_parse_name_segment(st, svp, first)) 241 return (B_FALSE); 242 first = B_FALSE; 243 } 244 245 if (!sv_consume_if_c(svp, 'E')) { 246 DEMDEBUG("%s: ERROR no terminating 'E'", __func__); 247 return (B_FALSE); 248 } 249 250 VERIFY3P(name.sv_first, <=, svp->sv_first); 251 DEMDEBUG("%s: consumed '%.*s'", __func__, 252 (int)(uintptr_t)(svp->sv_first - name.sv_first), name.sv_first); 253 254 return (B_TRUE); 255 } 256 257 static boolean_t 258 rustleg_parse_hash(rust_state_t *st, strview_t *svp) 259 { 260 if (HAS_ERROR(st)) 261 return (B_FALSE); 262 263 VERIFY(sv_consume_if_c(svp, 'h')); 264 if (!rust_appendc(st, 'h')) 265 return (B_FALSE); 266 267 while (sv_remaining(svp) > 0) { 268 char c = sv_consume_c(svp); 269 270 switch (c) { 271 /* 272 * The upper-case hex digits (A-F) are excluded as valid 273 * hash values for several reasons: 274 * 275 * 1. It would result in two different possible names for 276 * the same function, leading to ambiguity in linking (among 277 * other things). 278 * 279 * 2. It would cause potential ambiguity in parsing -- is a 280 * trailing 'E' part of the hash, or the terminating character 281 * in the mangled name? 282 * 283 * 3. No examples were able to be found in the wild where 284 * uppercase digits are used, and other rust demanglers all 285 * seem to assume the hash must contain lower-case hex digits. 286 */ 287 case '0': case '1': case '2': case '3': 288 case '4': case '5': case '6': case '7': 289 case '8': case '9': case 'a': case 'b': 290 case 'c': case 'd': case 'e': case 'f': 291 if (!rust_appendc(st, c)) 292 return (B_FALSE); 293 break; 294 default: 295 return (B_FALSE); 296 } 297 } 298 299 return (B_TRUE); 300 } 301 302 static boolean_t 303 rustleg_parse_special(rust_state_t *restrict st, strview_t *restrict svp) 304 { 305 if (HAS_ERROR(st)) 306 return (B_FALSE); 307 308 if (sv_peek(svp, 0) != '$') 309 return (B_FALSE); 310 311 for (size_t i = 0; i < rust_charmap_sz; i++) { 312 if (sv_consume_if(svp, rust_charmap[i].ruc_seq)) { 313 if (!rust_appendc(st, rust_charmap[i].ruc_ch)) 314 return (B_FALSE); 315 return (B_TRUE); 316 } 317 } 318 319 /* Handle $uXXXX$ */ 320 321 strview_t sv; 322 uint32_t val = 0; 323 uint_t ndigits = 0; 324 325 sv_init_sv(&sv, svp); 326 327 /* We peeked at this earlier, so it should still be there */ 328 VERIFY(sv_consume_if_c(&sv, '$')); 329 330 if (!sv_consume_if_c(&sv, 'u')) 331 return (B_FALSE); 332 333 while (sv_remaining(&sv) > 0) { 334 uint32_t cval = 0; 335 char c; 336 337 if (ndigits == 4) 338 return (B_FALSE); 339 340 c = sv_consume_c(&sv); 341 if (c >= '0' && c <= '9') 342 cval = c - '0'; 343 else if (c >= 'a' && c <= 'f') 344 cval = c - 'a' + 10; 345 else if (c == '$') 346 break; 347 else 348 return (B_FALSE); 349 350 val <<= 4; 351 val |= cval; 352 ndigits++; 353 } 354 355 if (!rust_append_utf8_c(st, val)) 356 return (B_FALSE); 357 358 sv_consume_n(svp, ndigits + 3); 359 return (B_TRUE); 360 } 361 362 static boolean_t 363 rustleg_add_sep(rust_state_t *st) 364 { 365 if (HAS_ERROR(st)) 366 return (B_FALSE); 367 368 return (rust_append(st, "::")); 369 } 370 371 static boolean_t 372 rustleg_valid_sym(const strview_t *sv) 373 { 374 size_t i; 375 376 for (i = 0; i < sv->sv_rem; i++) { 377 char c = sv->sv_first[i]; 378 379 if ((c & 0x80) == 0) 380 continue; 381 DEMDEBUG("%s: ERROR found 8-bit character '%c' in '%.*s' " 382 "at index %zu", __func__, c, SV_PRINT(sv), i); 383 return (B_FALSE); 384 } 385 return (B_TRUE); 386 } 387