1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11
12 /*
13 * Copyright 2019 Joyent, Inc.
14 * Copyright 2021 Jason King
15 */
16
17 #include <errno.h>
18 #include <libcustr.h>
19 #include <limits.h>
20 #include <string.h>
21 #include <stdio.h>
22
23 #include "rust.h"
24
25 /*
26 * Unfortunately, there is currently no official specification for the legacy
27 * rust name mangling. This is an attempt to document the understanding of the
28 * mangling used here. It is based off examination of
29 * https://docs.rs/rustc-demangle/0.1.13/rustc_demangle/
30 *
31 * A mangled rust name is:
32 * <prefix> <name>
33 *
34 * <prefix> ::= _Z
35 * __Z
36 *
37 * <name> ::= N <name-segment>+ [<hash>] E
38 *
39 * <name-segment> ::= <len> <name-chars>{len}
40 *
 * <len> ::= [1-9][0-9]*
42 *
43 * <name-chars> ::= <[A-Za-z]> <[A-Za-z0-9]>*
44 * <separator>
45 * <special>
46 *
47 * <separator> ::= '..' # '::'
48 *
49 * <special> ::= $SP$ # '@'
50 * $BP$ # '*'
51 * $RF$ # '&'
52 * $LT$ # '<'
53 * $GT$ # '>'
54 * $LP$ # '('
55 * $RP$ # ')'
56 * $C$ # ','
57 *
 * <hash> ::= <len> h <hex-digits>+
 *
 * <hex-digits> ::= <[0-9a-f]>
61 */
62
63 static const struct rust_charmap {
64 const char *ruc_seq;
65 char ruc_ch;
66 } rust_charmap[] = {
67 { "$SP$", '@' },
68 { "$BP$", '*' },
69 { "$RF$", '&' },
70 { "$LT$", '<' },
71 { "$GT$", '>' },
72 { "$LP$", '(' },
73 { "$RP$", ')' },
74 { "$C$", ',' },
75 };
76 static const size_t rust_charmap_sz = ARRAY_SIZE(rust_charmap);
77
78 static boolean_t rustleg_valid_sym(const strview_t *);
79 static boolean_t rustleg_parse_name(rust_state_t *, strview_t *);
80 static boolean_t rustleg_parse_hash(rust_state_t *, strview_t *);
81 static boolean_t rustleg_parse_special(rust_state_t *, strview_t *);
82 static boolean_t rustleg_add_sep(rust_state_t *);
83
84 boolean_t
rust_demangle_legacy(rust_state_t * restrict st,strview_t * restrict sv)85 rust_demangle_legacy(rust_state_t *restrict st, strview_t *restrict sv)
86 {
87
88 /* Make sure the whole thing contains valid characters */
89 if (!rustleg_valid_sym(sv)) {
90 st->rs_error = EINVAL;
91 return (B_FALSE);
92 }
93
94 if (sv_peek(sv, -1) != 'E') {
95 DEMDEBUG("ERROR: string does not end with 'E'");
96 st->rs_error = EINVAL;
97 return (B_FALSE);
98 }
99
100 if (!rustleg_parse_name(st, sv))
101 return (B_FALSE);
102
103 if (sv_remaining(sv) != 0) {
104 DEMDEBUG("ERROR: trailing characters in name");
105 st->rs_error = EINVAL;
106 return (B_FALSE);
107 }
108
109 return (B_TRUE);
110 }
111
112 static boolean_t
rustleg_parse_name_segment(rust_state_t * st,strview_t * svp,boolean_t first)113 rustleg_parse_name_segment(rust_state_t *st, strview_t *svp, boolean_t first)
114 {
115 strview_t orig;
116 strview_t name;
117 uint64_t len;
118 size_t rem;
119 boolean_t last = B_FALSE;
120
121 if (HAS_ERROR(st) || sv_remaining(svp) == 0)
122 return (B_FALSE);
123
124 sv_init_sv(&orig, svp);
125
126 if (!rust_parse_base10(st, svp, &len)) {
127 DEMDEBUG("ERROR: no leading length");
128 st->rs_error = EINVAL;
129 return (B_FALSE);
130 }
131
132 rem = sv_remaining(svp);
133
134 if (rem < len) {
135 DEMDEBUG("ERROR: segment length (%" PRIu64 ") > remaining "
136 "bytes in string (%zu)", len, rem);
137 st->rs_error = EINVAL;
138 return (B_FALSE);
139 }
140
141 /* Is this the last segment before the terminating E? */
142 if (rem == len + 1) {
143 VERIFY3U(sv_peek(svp, -1), ==, 'E');
144 last = B_TRUE;
145 }
146
147 if (!first && !rustleg_add_sep(st))
148 return (B_FALSE);
149
150 /* Reduce length of seg to the length we parsed */
151 (void) sv_init_sv_range(&name, svp, len);
152
153 DEMDEBUG("%s: segment='%.*s'", __func__, SV_PRINT(&name));
154
155 /*
156 * A rust hash starts with 'h', and is the last component of a name
157 * before the terminating 'E'. It is however not always present
158 * in every mangled symbol, and a last segment that starts with 'h'
159 * could be confused for it, so failing to part it just means
160 * we don't have a trailing hash.
161 */
162 if (sv_peek(&name, 0) == 'h' && last) {
163 if (rustleg_parse_hash(st, &name))
164 goto done;
165
166 /*
167 * However any error other than 'not a hash' (e.g. ENOMEM)
168 * means we should fail.
169 */
170 if (st->rs_error != 0)
171 goto done;
172 }
173
174 /* A '_' followed by $ is ignored at the start of a name segment */
175 if (sv_peek(&name, 0) == '_' && sv_peek(&name, 1) == '$')
176 (void) sv_consume_n(&name, 1);
177
178 while (sv_remaining(&name) > 0) {
179 switch (sv_peek(&name, 0)) {
180 case '$':
181 if (rustleg_parse_special(st, &name))
182 continue;
183 break;
184 case '.':
185 /* Convert '..' to '::' */
186 if (sv_peek(&name, 1) != '.')
187 break;
188
189 if (!rustleg_add_sep(st))
190 return (B_FALSE);
191
192 sv_consume_n(&name, 2);
193 continue;
194 default:
195 break;
196 }
197
198 if (!rust_appendc(st, sv_consume_c(&name))) {
199 SET_ERROR(st);
200 return (B_FALSE);
201 }
202 }
203
204 done:
205 sv_consume_n(svp, len);
206
207 VERIFY3P(orig.sv_first, <=, svp->sv_first);
208 DEMDEBUG("%s: consumed '%.*s'", __func__,
209 (int)(uintptr_t)(svp->sv_first - orig.sv_first), orig.sv_first);
210 return (B_TRUE);
211 }
212
213 /*
214 * Parse N (<num><name>{num})+ [<num>h<hex digits]E
215 */
216 static boolean_t
rustleg_parse_name(rust_state_t * st,strview_t * svp)217 rustleg_parse_name(rust_state_t *st, strview_t *svp)
218 {
219 strview_t name;
220 boolean_t first = B_TRUE;
221
222 sv_init_sv(&name, svp);
223
224 if (HAS_ERROR(st))
225 return (B_FALSE);
226
227 DEMDEBUG("%s: name = '%.*s'", __func__, SV_PRINT(&name));
228
229 if (sv_remaining(svp) == 0) {
230 DEMDEBUG("%s: empty name", __func__);
231 return (B_FALSE);
232 }
233
234 if (!sv_consume_if_c(svp, 'N')) {
235 DEMDEBUG("%s: does not start with 'N'", __func__);
236 return (B_FALSE);
237 }
238
239 while (sv_remaining(svp) > 0 && sv_peek(svp, 0) != 'E') {
240 if (!rustleg_parse_name_segment(st, svp, first))
241 return (B_FALSE);
242 first = B_FALSE;
243 }
244
245 if (!sv_consume_if_c(svp, 'E')) {
246 DEMDEBUG("%s: ERROR no terminating 'E'", __func__);
247 return (B_FALSE);
248 }
249
250 VERIFY3P(name.sv_first, <=, svp->sv_first);
251 DEMDEBUG("%s: consumed '%.*s'", __func__,
252 (int)(uintptr_t)(svp->sv_first - name.sv_first), name.sv_first);
253
254 return (B_TRUE);
255 }
256
257 static boolean_t
rustleg_parse_hash(rust_state_t * st,strview_t * svp)258 rustleg_parse_hash(rust_state_t *st, strview_t *svp)
259 {
260 if (HAS_ERROR(st))
261 return (B_FALSE);
262
263 VERIFY(sv_consume_if_c(svp, 'h'));
264 if (!rust_appendc(st, 'h'))
265 return (B_FALSE);
266
267 while (sv_remaining(svp) > 0) {
268 char c = sv_consume_c(svp);
269
270 switch (c) {
271 /*
272 * The upper-case hex digits (A-F) are excluded as valid
273 * hash values for several reasons:
274 *
275 * 1. It would result in two different possible names for
276 * the same function, leading to ambiguity in linking (among
277 * other things).
278 *
279 * 2. It would cause potential ambiguity in parsing -- is a
280 * trailing 'E' part of the hash, or the terminating character
281 * in the mangled name?
282 *
283 * 3. No examples were able to be found in the wild where
284 * uppercase digits are used, and other rust demanglers all
285 * seem to assume the hash must contain lower-case hex digits.
286 */
287 case '0': case '1': case '2': case '3':
288 case '4': case '5': case '6': case '7':
289 case '8': case '9': case 'a': case 'b':
290 case 'c': case 'd': case 'e': case 'f':
291 if (!rust_appendc(st, c))
292 return (B_FALSE);
293 break;
294 default:
295 return (B_FALSE);
296 }
297 }
298
299 return (B_TRUE);
300 }
301
302 static boolean_t
rustleg_parse_special(rust_state_t * restrict st,strview_t * restrict svp)303 rustleg_parse_special(rust_state_t *restrict st, strview_t *restrict svp)
304 {
305 if (HAS_ERROR(st))
306 return (B_FALSE);
307
308 if (sv_peek(svp, 0) != '$')
309 return (B_FALSE);
310
311 for (size_t i = 0; i < rust_charmap_sz; i++) {
312 if (sv_consume_if(svp, rust_charmap[i].ruc_seq)) {
313 if (!rust_appendc(st, rust_charmap[i].ruc_ch))
314 return (B_FALSE);
315 return (B_TRUE);
316 }
317 }
318
319 /* Handle $uXXXX$ */
320
321 strview_t sv;
322 uint32_t val = 0;
323 uint_t ndigits = 0;
324
325 sv_init_sv(&sv, svp);
326
327 /* We peeked at this earlier, so it should still be there */
328 VERIFY(sv_consume_if_c(&sv, '$'));
329
330 if (!sv_consume_if_c(&sv, 'u'))
331 return (B_FALSE);
332
333 while (sv_remaining(&sv) > 0) {
334 uint32_t cval = 0;
335 char c;
336
337 if (ndigits == 4)
338 return (B_FALSE);
339
340 c = sv_consume_c(&sv);
341 if (c >= '0' && c <= '9')
342 cval = c - '0';
343 else if (c >= 'a' && c <= 'f')
344 cval = c - 'a' + 10;
345 else if (c == '$')
346 break;
347 else
348 return (B_FALSE);
349
350 val <<= 4;
351 val |= cval;
352 ndigits++;
353 }
354
355 if (!rust_append_utf8_c(st, val))
356 return (B_FALSE);
357
358 sv_consume_n(svp, ndigits + 3);
359 return (B_TRUE);
360 }
361
362 static boolean_t
rustleg_add_sep(rust_state_t * st)363 rustleg_add_sep(rust_state_t *st)
364 {
365 if (HAS_ERROR(st))
366 return (B_FALSE);
367
368 return (rust_append(st, "::"));
369 }
370
371 static boolean_t
rustleg_valid_sym(const strview_t * sv)372 rustleg_valid_sym(const strview_t *sv)
373 {
374 size_t i;
375
376 for (i = 0; i < sv->sv_rem; i++) {
377 char c = sv->sv_first[i];
378
379 if ((c & 0x80) == 0)
380 continue;
381 DEMDEBUG("%s: ERROR found 8-bit character '%c' in '%.*s' "
382 "at index %zu", __func__, c, SV_PRINT(sv), i);
383 return (B_FALSE);
384 }
385 return (B_TRUE);
386 }
387