xref: /illumos-gate/usr/src/lib/libdemangle/common/rust.c (revision 4fcce4872b9846a3c40d70c0de66142c56585c73)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2019, Joyent, Inc.
14  */
15 
16 #include <errno.h>
17 #include <libcustr.h>
18 #include <limits.h>
19 #include <string.h>
20 #include <sys/ctype.h>	/* We want the C locale ISXXX() versions */
21 #include <sys/debug.h>
22 #include <stdio.h>
23 #include <sys/sysmacros.h>
24 
25 #include "strview.h"
26 #include "demangle_int.h"
27 
28 /*
29  * Unfortunately, there is currently no official specification for the rust
30  * name mangling.  This is an attempt to document the understanding of the
31  * mangling used here.  It is based off examination of
32  *     https://docs.rs/rustc-demangle/0.1.13/rustc_demangle/
33  *
34  * A mangled rust name is:
35  *     <prefix> <name> <hash> E
36  *
37  * <prefix>	::=	_ZN
38  *			__ZN
39  *
40  * <name>	::= <name-segment>+
41  *
42  * <name-segment> ::= <len> <name-chars>{len}
43  *
44  * <len>	::= [1-9][0-9]*
45  *
46  * <name-chars>	::=	<[A-Za-z]> <[A-Za-z0-9]>*
47  *			<separator>
48  *			<special>
49  *
50  * <separator>	::=	'..'	# '::'
51  *
52  * <special>	::=	$SP$	# '@'
53  *			$BP$	# '*'
54  *			$RF$	# '&'
55  *			$LT$	# '<'
56  *			$GT$	# '>'
57  *			$LP$	# '('
58  *			$RP$	# ')'
59  *			$C$	# ','
60  *			$u7e$	# '~'
61  *			$u20$	# ' '
62  *			$u27$	# '\''
63  *			$u3d$	# '='
64  *			$u5b$	# '['
65  *			$u5d$	# ']'
66  *			$u7b$	# '{'
67  *			$u7d$	# '}'
68  *			$u3b$	# ';'
69  *			$u2b$	# '+'
70  *			$u22$	# '"'
71  *
72  * <hash>	::= <len> h <hex-digits>+
73  *
74  * <hex-digits>	::= <[0-9a-f]>
75  */
76 
77 typedef struct rustdem_state {
78 	const char	*rds_str;
79 	custr_t		*rds_demangled;
80 	sysdem_ops_t	*rds_ops;
81 	int		rds_error;
82 } rustdem_state_t;
83 
84 static const struct rust_charmap {
85 	const char	*ruc_seq;
86 	char		ruc_ch;
87 } rust_charmap[] = {
88 	{ "$SP$", '@' },
89 	{ "$BP$", '*' },
90 	{ "$RF$", '&' },
91 	{ "$LT$", '<' },
92 	{ "$GT$", '>' },
93 	{ "$LP$", '(' },
94 	{ "$RP$", ')' },
95 	{ "$C$", ',' },
96 	{ "$u7e$", '~' },
97 	{ "$u20$", ' ' },
98 	{ "$u27$", '\'' },
99 	{ "$u3d$", '=' },
100 	{ "$u5b$", '[' },
101 	{ "$u5d$", ']' },
102 	{ "$u7b$", '{' },
103 	{ "$u7d$", '}' },
104 	{ "$u3b$", ';' },
105 	{ "$u2b$", '+' },
106 	{ "$u22$", '"' }
107 };
108 static const size_t rust_charmap_sz = ARRAY_SIZE(rust_charmap);
109 
110 static void *rustdem_alloc(custr_alloc_t *, size_t);
111 static void rustdem_free(custr_alloc_t *, void *, size_t);
112 
113 static boolean_t rustdem_append_c(rustdem_state_t *, char);
114 static boolean_t rustdem_all_ascii(const strview_t *);
115 
116 static boolean_t rustdem_parse_prefix(rustdem_state_t *, strview_t *);
117 static boolean_t rustdem_parse_name(rustdem_state_t *, strview_t *);
118 static boolean_t rustdem_parse_hash(rustdem_state_t *, strview_t *);
119 static boolean_t rustdem_parse_num(rustdem_state_t *, strview_t *, uint64_t *);
120 static boolean_t rustdem_parse_special(rustdem_state_t *, strview_t *);
121 static boolean_t rustdem_add_sep(rustdem_state_t *);
122 
123 char *
124 rust_demangle(const char *s, size_t slen, sysdem_ops_t *ops)
125 {
126 	rustdem_state_t st = {
127 		.rds_str = s,
128 		.rds_ops = ops,
129 	};
130 	custr_alloc_ops_t custr_ops = {
131 		.custr_ao_alloc = rustdem_alloc,
132 		.custr_ao_free = rustdem_free
133 	};
134 	custr_alloc_t custr_alloc = {
135 		.cua_version = CUSTR_VERSION
136 	};
137 	strview_t sv;
138 	int ret;
139 
140 	if (custr_alloc_init(&custr_alloc, &custr_ops) != 0)
141 		return (NULL);
142 	custr_alloc.cua_arg = &st;
143 
144 	sv_init_str(&sv, s, s + slen);
145 
146 	if (sv_remaining(&sv) < 1 || sv_peek(&sv, -1) != 'E') {
147 		DEMDEBUG("ERROR: string is either too small or does not end "
148 		    "with 'E'");
149 		errno = EINVAL;
150 		return (NULL);
151 	}
152 
153 	if (!rustdem_parse_prefix(&st, &sv)) {
154 		DEMDEBUG("ERROR: could not parse prefix");
155 		errno = EINVAL;
156 		return (NULL);
157 	}
158 	DEMDEBUG("parsed prefix; remaining='%.*s'", SV_PRINT(&sv));
159 
160 	if (!rustdem_all_ascii(&sv)) {
161 		/* rustdem_all_ascii() provides debug output */
162 		errno = EINVAL;
163 		return (NULL);
164 	}
165 
166 	if ((ret = custr_xalloc(&st.rds_demangled, &custr_alloc)) != 0)
167 		return (NULL);
168 
169 	while (sv_remaining(&sv) > 1) {
170 		if (rustdem_parse_name(&st, &sv))
171 			continue;
172 		if (st.rds_error != 0)
173 			goto fail;
174 	}
175 
176 	if (st.rds_error != 0 || !sv_consume_if_c(&sv, 'E'))
177 		goto fail;
178 
179 	char *res = xstrdup(ops, custr_cstr(st.rds_demangled));
180 	if (res == NULL) {
181 		st.rds_error = errno;
182 		goto fail;
183 	}
184 
185 	custr_free(st.rds_demangled);
186 	DEMDEBUG("result = '%s'", res);
187 	return (res);
188 
189 fail:
190 	custr_free(st.rds_demangled);
191 	errno = st.rds_error;
192 	return (NULL);
193 }
194 
195 static boolean_t
196 rustdem_parse_prefix(rustdem_state_t *st, strview_t *svp)
197 {
198 	strview_t pfx;
199 
200 	sv_init_sv(&pfx, svp);
201 
202 	DEMDEBUG("checking for '_ZN' or '__ZN' in '%.*s'", SV_PRINT(&pfx));
203 
204 	if (st->rds_error != 0)
205 		return (B_FALSE);
206 
207 	if (!sv_consume_if_c(&pfx, '_'))
208 		return (B_FALSE);
209 
210 	(void) sv_consume_if_c(&pfx, '_');
211 
212 	if (!sv_consume_if_c(&pfx, 'Z') || !sv_consume_if_c(&pfx, 'N'))
213 		return (B_FALSE);
214 
215 	/* Update svp with new position */
216 	sv_init_sv(svp, &pfx);
217 	return (B_TRUE);
218 }
219 
220 static boolean_t
221 rustdem_parse_name_segment(rustdem_state_t *st, strview_t *svp, boolean_t first)
222 {
223 	strview_t sv;
224 	strview_t name;
225 	uint64_t len;
226 	size_t rem;
227 	boolean_t last = B_FALSE;
228 
229 	if (st->rds_error != 0 || sv_remaining(svp) == 0)
230 		return (B_FALSE);
231 
232 	sv_init_sv(&sv, svp);
233 
234 	if (!rustdem_parse_num(st, &sv, &len)) {
235 		DEMDEBUG("ERROR: no leading length");
236 		st->rds_error = EINVAL;
237 		return (B_FALSE);
238 	}
239 
240 	rem = sv_remaining(&sv);
241 
242 	if (rem < len) {
243 		st->rds_error = EINVAL;
244 		return (B_FALSE);
245 	}
246 
247 	/* Is this the last segment before the terminating E? */
248 	if (rem == len + 1) {
249 		VERIFY3U(sv_peek(&sv, -1), ==, 'E');
250 		last = B_TRUE;
251 	}
252 
253 	if (!first && !rustdem_add_sep(st))
254 		return (B_FALSE);
255 
256 	/* Reduce length of seg to the length we parsed */
257 	(void) sv_init_sv_range(&name, &sv, len);
258 
259 	DEMDEBUG("%s: segment='%.*s'", __func__, SV_PRINT(&name));
260 
261 	/*
262 	 * A rust hash starts with 'h', and is the last component of a name
263 	 * before the terminating 'E'
264 	 */
265 	if (sv_peek(&name, 0) == 'h' && last) {
266 		if (!rustdem_parse_hash(st, &name))
267 			return (B_FALSE);
268 		goto done;
269 	}
270 
271 	while (sv_remaining(&name) > 0) {
272 		switch (sv_peek(&name, 0)) {
273 		case '$':
274 			if (rustdem_parse_special(st, &name))
275 				continue;
276 			break;
277 		case '_':
278 			if (sv_peek(&name, 1) == '$') {
279 				/*
280 				 * Only consume/ignore '_'.  Leave
281 				 * $ for next round.
282 				 */
283 				sv_consume_n(&name, 1);
284 				continue;
285 			}
286 			break;
287 		case '.':
288 			/* Convert '..' to '::' */
289 			if (sv_peek(&name, 1) != '.')
290 				break;
291 
292 			if (!rustdem_add_sep(st))
293 				return (B_FALSE);
294 
295 			sv_consume_n(&name, 2);
296 			continue;
297 		default:
298 			break;
299 		}
300 
301 		if (custr_appendc(st->rds_demangled,
302 		    sv_consume_c(&name)) != 0) {
303 			st->rds_error = ENOMEM;
304 			return (B_FALSE);
305 		}
306 	}
307 
308 done:
309 	DEMDEBUG("%s: consumed '%.*s'", __func__, (int)len, svp->sv_first);
310 	sv_consume_n(&sv, len);
311 	sv_init_sv(svp, &sv);
312 	return (B_TRUE);
313 }
314 
315 static boolean_t
316 rustdem_parse_name(rustdem_state_t *st, strview_t *svp)
317 {
318 	strview_t name;
319 	boolean_t first = B_TRUE;
320 
321 	if (st->rds_error != 0)
322 		return (B_FALSE);
323 
324 	sv_init_sv(&name, svp);
325 
326 	if (sv_remaining(&name) == 0)
327 		return (B_FALSE);
328 
329 	while (sv_remaining(&name) > 0 && sv_peek(&name, 0) != 'E') {
330 		if (!rustdem_parse_name_segment(st, &name, first))
331 			return (B_FALSE);
332 		first = B_FALSE;
333 	}
334 
335 	sv_init_sv(svp, &name);
336 	return (B_TRUE);
337 }
338 
339 static boolean_t
340 rustdem_parse_hash(rustdem_state_t *st, strview_t *svp)
341 {
342 	strview_t sv;
343 
344 	sv_init_sv(&sv, svp);
345 
346 	VERIFY(sv_consume_if_c(&sv, 'h'));
347 	if (!rustdem_append_c(st, 'h'))
348 		return (B_FALSE);
349 
350 	while (sv_remaining(&sv) > 0) {
351 		char c = sv_consume_c(&sv);
352 
353 		switch (c) {
354 		/*
355 		 * The upper-case hex digits (A-F) are excluded as valid
356 		 * hash values for several reasons:
357 		 *
358 		 * 1. It would result in two different possible names for
359 		 * the same function, leading to ambiguity in linking (among
360 		 * other things).
361 		 *
362 		 * 2. It would cause potential ambiguity in parsing -- is a
363 		 * trailing 'E' part of the hash, or the terminating character
364 		 * in the mangled name?
365 		 *
366 		 * 3. No examples were able to be found in the wild where
367 		 * uppercase digits are used, and other rust demanglers all
368 		 * seem to assume the hash must contain lower-case hex digits.
369 		 */
370 		case '0': case '1': case '2': case '3':
371 		case '4': case '5': case '6': case '7':
372 		case '8': case '9': case 'a': case 'b':
373 		case 'c': case 'd': case 'e': case 'f':
374 			if (!rustdem_append_c(st, c))
375 				return (B_FALSE);
376 			break;
377 		default:
378 			return (B_FALSE);
379 		}
380 	}
381 
382 	sv_init_sv(svp, &sv);
383 	return (B_TRUE);
384 }
385 
386 /*
387  * We have to pick an arbitrary limit here; 999,999,999 fits comfortably
388  * within an int32_t, so let's go with that, as it seems unlikely we'd
389  * ever see a larger value in context.
390  */
391 #define	MAX_DIGITS 9
392 
393 static boolean_t
394 rustdem_parse_num(rustdem_state_t *restrict st, strview_t *restrict svp,
395     uint64_t *restrict valp)
396 {
397 	strview_t snum;
398 	uint64_t v = 0;
399 	size_t ndigits = 0;
400 	char c;
401 
402 	if (st->rds_error != 0)
403 		return (B_FALSE);
404 
405 	sv_init_sv(&snum, svp);
406 
407 	DEMDEBUG("%s: str='%.*s'", __func__, SV_PRINT(&snum));
408 
409 	c = sv_peek(&snum, 0);
410 	if (!ISDIGIT(c)) {
411 		DEMDEBUG("%s: ERROR no digits in str\n", __func__);
412 		st->rds_error = EINVAL;
413 		return (B_FALSE);
414 	}
415 
416 	/*
417 	 * Since there is currently no official specification on rust name
418 	 * mangling, only that it has been stated that rust follows what
419 	 * C++ mangling does.  In the Itanium C++ ABI (what practically
420 	 * every non-Windows C++ implementation uses these days), it
421 	 * explicitly disallows leading 0s in numeric values (except for
422 	 * substition and template indexes, which aren't relevant here).
423 	 * We enforce the same restriction -- if a rust implementation allowed
424 	 * leading zeros in numbers (basically segment lengths) it'd
425 	 * cause all sorts of ambiguity problems with names that likely lead
426 	 * to much bigger problems with linking and such, so this seems
427 	 * reasonable.
428 	 */
429 	if (c == '0') {
430 		DEMDEBUG("%s: ERROR number starts with leading 0\n", __func__);
431 		st->rds_error = EINVAL;
432 		return (B_FALSE);
433 	}
434 
435 	while (sv_remaining(&snum) > 0 && ndigits <= MAX_DIGITS) {
436 		c = sv_consume_c(&snum);
437 
438 		if (!ISDIGIT(c))
439 			break;
440 
441 		v *= 10;
442 		v += c - '0';
443 		ndigits++;
444 	}
445 
446 	if (ndigits > MAX_DIGITS) {
447 		DEMDEBUG("%s: value %llu is too large\n", __func__, v);
448 		st->rds_error = ERANGE;
449 		return (B_FALSE);
450 	}
451 
452 	DEMDEBUG("%s: num=%llu", __func__, v);
453 
454 	*valp = v;
455 	sv_consume_n(svp, ndigits);
456 	return (B_TRUE);
457 }
458 
459 static boolean_t
460 rustdem_parse_special(rustdem_state_t *restrict st, strview_t *restrict svp)
461 {
462 	if (st->rds_error != 0)
463 		return (B_FALSE);
464 
465 	if (sv_peek(svp, 0) != '$')
466 		return (B_FALSE);
467 
468 	for (size_t i = 0; i < rust_charmap_sz; i++) {
469 		if (sv_consume_if(svp, rust_charmap[i].ruc_seq)) {
470 			if (!rustdem_append_c(st, rust_charmap[i].ruc_ch))
471 				return (B_FALSE);
472 			return (B_TRUE);
473 		}
474 	}
475 	return (B_FALSE);
476 }
477 
478 static boolean_t
479 rustdem_add_sep(rustdem_state_t *st)
480 {
481 	if (st->rds_error != 0)
482 		return (B_FALSE);
483 
484 	if (!rustdem_append_c(st, ':') ||
485 	    !rustdem_append_c(st, ':'))
486 		return (B_FALSE);
487 
488 	return (B_TRUE);
489 }
490 
491 static boolean_t
492 rustdem_append_c(rustdem_state_t *st, char c)
493 {
494 	if (st->rds_error != 0)
495 		return (B_FALSE);
496 
497 	if (custr_appendc(st->rds_demangled, c) == 0)
498 		return (B_TRUE);
499 
500 	st->rds_error = errno;
501 	return (B_FALSE);
502 }
503 
504 static boolean_t
505 rustdem_all_ascii(const strview_t *svp)
506 {
507 	strview_t p;
508 
509 	sv_init_sv(&p, svp);
510 
511 	while (sv_remaining(&p) > 0) {
512 		char c = sv_consume_c(&p);
513 
514 		/*
515 		 * #including <sys/ctype.h> conflicts with <ctype.h>.  Since
516 		 * we want the C locale macros (ISDIGIT, etc), it also means
517 		 * we can't use isascii(3C).
518 		 */
519 		if ((c & 0x80) != 0) {
520 			DEMDEBUG("%s: found non-ascii character 0x%02hhx at "
521 			    "offset %tu", __func__, c,
522 			    (ptrdiff_t)(p.sv_first - svp->sv_first));
523 			return (B_FALSE);
524 		}
525 	}
526 	return (B_TRUE);
527 }
528 
529 static void *
530 rustdem_alloc(custr_alloc_t *cao, size_t len)
531 {
532 	rustdem_state_t *st = cao->cua_arg;
533 	return (zalloc(st->rds_ops, len));
534 }
535 
536 static void
537 rustdem_free(custr_alloc_t *cao, void *p, size_t len)
538 {
539 	rustdem_state_t *st = cao->cua_arg;
540 	xfree(st->rds_ops, p, len);
541 }
542