xref: /illumos-gate/usr/src/lib/libdemangle/common/rust-legacy.c (revision 63f91fbc3c024870d86dc3332a4a0080fb29bc40)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2019 Joyent, Inc.
14  * Copyright 2021 Jason King
15  */
16 
17 #include <errno.h>
18 #include <libcustr.h>
19 #include <limits.h>
20 #include <string.h>
21 #include <stdio.h>
22 
23 #include "rust.h"
24 
/*
 * Unfortunately, there is currently no official specification for the legacy
 * rust name mangling.  This is an attempt to document the understanding of the
 * mangling used here.  It is based on an examination of
 *     https://docs.rs/rustc-demangle/0.1.13/rustc_demangle/
 *
 * A mangled rust name is:
 *     <prefix> <name>
 *
 * <prefix>	::=	_Z
 *			__Z
 *
 * <name>	::= N <name-segment>+ [<hash>] E
 *
 * <name-segment> ::= <len> <name-chars>{len}
 *
 * <len>	::= [1-9][0-9]*
 *
 * <name-chars>	::=	<[A-Za-z]> <[A-Za-z0-9]>*
 *			<separator>
 *			<special>
 *
 * <separator>	::=	'..'	# '::'
 *
 * <special>	::=	$SP$	# '@'
 *			$BP$	# '*'
 *			$RF$	# '&'
 *			$LT$	# '<'
 *			$GT$	# '>'
 *			$LP$	# '('
 *			$RP$	# ')'
 *			$C$	# ','
 *			$u <hex-digits>+ $	# the code point with that hex value
 *
 * <hash>	::= <len> h <hex-digits>+
 *
 * <hex-digits>	::= <[0-9a-f]>
 */
62 
63 static const struct rust_charmap {
64 	const char	*ruc_seq;
65 	char		ruc_ch;
66 } rust_charmap[] = {
67 	{ "$SP$", '@' },
68 	{ "$BP$", '*' },
69 	{ "$RF$", '&' },
70 	{ "$LT$", '<' },
71 	{ "$GT$", '>' },
72 	{ "$LP$", '(' },
73 	{ "$RP$", ')' },
74 	{ "$C$", ',' },
75 };
76 static const size_t rust_charmap_sz = ARRAY_SIZE(rust_charmap);
77 
78 static boolean_t rustleg_valid_sym(const strview_t *);
79 static boolean_t rustleg_parse_name(rust_state_t *, strview_t *);
80 static boolean_t rustleg_parse_hash(rust_state_t *, strview_t *);
81 static boolean_t rustleg_parse_special(rust_state_t *, strview_t *);
82 static boolean_t rustleg_add_sep(rust_state_t *);
83 
84 boolean_t
85 rust_demangle_legacy(rust_state_t *restrict st, strview_t *restrict sv)
86 {
87 
88 	/* Make sure the whole thing contains valid characters */
89 	if (!rustleg_valid_sym(sv)) {
90 		st->rs_error = EINVAL;
91 		return (B_FALSE);
92 	}
93 
94 	if (sv_peek(sv, -1) != 'E') {
95 		DEMDEBUG("ERROR: string does not end with 'E'");
96 		st->rs_error = EINVAL;
97 		return (B_FALSE);
98 	}
99 
100 	if (!rustleg_parse_name(st, sv))
101 		return (B_FALSE);
102 
103 	if (sv_remaining(sv) != 0) {
104 		DEMDEBUG("ERROR: trailing characters in name");
105 		st->rs_error = EINVAL;
106 		return (B_FALSE);
107 	}
108 
109 	return (B_TRUE);
110 }
111 
112 static boolean_t
113 rustleg_parse_name_segment(rust_state_t *st, strview_t *svp, boolean_t first)
114 {
115 	strview_t orig;
116 	strview_t name;
117 	uint64_t len;
118 	size_t rem;
119 	boolean_t last = B_FALSE;
120 
121 	if (HAS_ERROR(st) || sv_remaining(svp) == 0)
122 		return (B_FALSE);
123 
124 	sv_init_sv(&orig, svp);
125 
126 	if (!rust_parse_base10(st, svp, &len)) {
127 		DEMDEBUG("ERROR: no leading length");
128 		st->rs_error = EINVAL;
129 		return (B_FALSE);
130 	}
131 
132 	rem = sv_remaining(svp);
133 
134 	if (rem < len) {
135 		DEMDEBUG("ERROR: segment length (%" PRIu64 ") > remaining "
136 		    "bytes in string (%zu)", len, rem);
137 		st->rs_error = EINVAL;
138 		return (B_FALSE);
139 	}
140 
141 	/* Is this the last segment before the terminating E? */
142 	if (rem == len + 1) {
143 		VERIFY3U(sv_peek(svp, -1), ==, 'E');
144 		last = B_TRUE;
145 	}
146 
147 	if (!first && !rustleg_add_sep(st))
148 		return (B_FALSE);
149 
150 	/* Reduce length of seg to the length we parsed */
151 	(void) sv_init_sv_range(&name, svp, len);
152 
153 	DEMDEBUG("%s: segment='%.*s'", __func__, SV_PRINT(&name));
154 
155 	/*
156 	 * A rust hash starts with 'h', and is the last component of a name
157 	 * before the terminating 'E'. It is however not always present
158 	 * in every mangled symbol, and a last segment that starts with 'h'
159 	 * could be confused for it, so failing to part it just means
160 	 * we don't have a trailing hash.
161 	 */
162 	if (sv_peek(&name, 0) == 'h' && last) {
163 		if (rustleg_parse_hash(st, &name))
164 			goto done;
165 
166 		/*
167 		 * However any error other than 'not a hash' (e.g. ENOMEM)
168 		 * means we should fail.
169 		 */
170 		if (st->rs_error != 0)
171 			goto done;
172 	}
173 
174 	/* A '_' followed by $ is ignored at the start of a name segment */
175 	if (sv_peek(&name, 0) == '_' && sv_peek(&name, 1) == '$')
176 		(void) sv_consume_n(&name, 1);
177 
178 	while (sv_remaining(&name) > 0) {
179 		switch (sv_peek(&name, 0)) {
180 		case '$':
181 			if (rustleg_parse_special(st, &name))
182 				continue;
183 			break;
184 		case '.':
185 			/* Convert '..' to '::' */
186 			if (sv_peek(&name, 1) != '.')
187 				break;
188 
189 			if (!rustleg_add_sep(st))
190 				return (B_FALSE);
191 
192 			sv_consume_n(&name, 2);
193 			continue;
194 		default:
195 			break;
196 		}
197 
198 		if (!rust_appendc(st, sv_consume_c(&name))) {
199 			SET_ERROR(st);
200 			return (B_FALSE);
201 		}
202 	}
203 
204 done:
205 	sv_consume_n(svp, len);
206 
207 	VERIFY3P(orig.sv_first, <=, svp->sv_first);
208 	DEMDEBUG("%s: consumed '%.*s'", __func__,
209 	    (int)(uintptr_t)(svp->sv_first - orig.sv_first), orig.sv_first);
210 	return (B_TRUE);
211 }
212 
213 /*
214  * Parse N (<num><name>{num})+ [<num>h<hex digits]E
215  */
216 static boolean_t
217 rustleg_parse_name(rust_state_t *st, strview_t *svp)
218 {
219 	strview_t name;
220 	boolean_t first = B_TRUE;
221 
222 	sv_init_sv(&name, svp);
223 
224 	if (HAS_ERROR(st))
225 		return (B_FALSE);
226 
227 	DEMDEBUG("%s: name = '%.*s'", __func__, SV_PRINT(&name));
228 
229 	if (sv_remaining(svp) == 0) {
230 		DEMDEBUG("%s: empty name", __func__);
231 		return (B_FALSE);
232 	}
233 
234 	if (!sv_consume_if_c(svp, 'N')) {
235 		DEMDEBUG("%s: does not start with 'N'", __func__);
236 		return (B_FALSE);
237 	}
238 
239 	while (sv_remaining(svp) > 0 && sv_peek(svp, 0) != 'E') {
240 		if (!rustleg_parse_name_segment(st, svp, first))
241 			return (B_FALSE);
242 		first = B_FALSE;
243 	}
244 
245 	if (!sv_consume_if_c(svp, 'E')) {
246 		DEMDEBUG("%s: ERROR no terminating 'E'", __func__);
247 		return (B_FALSE);
248 	}
249 
250 	VERIFY3P(name.sv_first, <=, svp->sv_first);
251 	DEMDEBUG("%s: consumed '%.*s'", __func__,
252 	    (int)(uintptr_t)(svp->sv_first - name.sv_first), name.sv_first);
253 
254 	return (B_TRUE);
255 }
256 
257 static boolean_t
258 rustleg_parse_hash(rust_state_t *st, strview_t *svp)
259 {
260 	if (HAS_ERROR(st))
261 		return (B_FALSE);
262 
263 	VERIFY(sv_consume_if_c(svp, 'h'));
264 	if (!rust_appendc(st, 'h'))
265 		return (B_FALSE);
266 
267 	while (sv_remaining(svp) > 0) {
268 		char c = sv_consume_c(svp);
269 
270 		switch (c) {
271 		/*
272 		 * The upper-case hex digits (A-F) are excluded as valid
273 		 * hash values for several reasons:
274 		 *
275 		 * 1. It would result in two different possible names for
276 		 * the same function, leading to ambiguity in linking (among
277 		 * other things).
278 		 *
279 		 * 2. It would cause potential ambiguity in parsing -- is a
280 		 * trailing 'E' part of the hash, or the terminating character
281 		 * in the mangled name?
282 		 *
283 		 * 3. No examples were able to be found in the wild where
284 		 * uppercase digits are used, and other rust demanglers all
285 		 * seem to assume the hash must contain lower-case hex digits.
286 		 */
287 		case '0': case '1': case '2': case '3':
288 		case '4': case '5': case '6': case '7':
289 		case '8': case '9': case 'a': case 'b':
290 		case 'c': case 'd': case 'e': case 'f':
291 			if (!rust_appendc(st, c))
292 				return (B_FALSE);
293 			break;
294 		default:
295 			return (B_FALSE);
296 		}
297 	}
298 
299 	return (B_TRUE);
300 }
301 
302 static boolean_t
303 rustleg_parse_special(rust_state_t *restrict st, strview_t *restrict svp)
304 {
305 	if (HAS_ERROR(st))
306 		return (B_FALSE);
307 
308 	if (sv_peek(svp, 0) != '$')
309 		return (B_FALSE);
310 
311 	for (size_t i = 0; i < rust_charmap_sz; i++) {
312 		if (sv_consume_if(svp, rust_charmap[i].ruc_seq)) {
313 			if (!rust_appendc(st, rust_charmap[i].ruc_ch))
314 				return (B_FALSE);
315 			return (B_TRUE);
316 		}
317 	}
318 
319 	/* Handle $uXXXX$ */
320 
321 	strview_t sv;
322 	uint32_t val = 0;
323 	uint_t ndigits = 0;
324 
325 	sv_init_sv(&sv, svp);
326 
327 	/* We peeked at this earlier, so it should still be there */
328 	VERIFY(sv_consume_if_c(&sv, '$'));
329 
330 	if (!sv_consume_if_c(&sv, 'u'))
331 		return (B_FALSE);
332 
333 	while (sv_remaining(&sv) > 0) {
334 		uint32_t cval = 0;
335 		char c;
336 
337 		if (ndigits == 4)
338 			return (B_FALSE);
339 
340 		c = sv_consume_c(&sv);
341 		if (c >= '0' && c <= '9')
342 			cval = c - '0';
343 		else if (c >= 'a' && c <= 'f')
344 			cval = c - 'a' + 10;
345 		else if (c == '$')
346 			break;
347 		else
348 			return (B_FALSE);
349 
350 		val <<= 4;
351 		val |= cval;
352 		ndigits++;
353 	}
354 
355 	if (!rust_append_utf8_c(st, val))
356 		return (B_FALSE);
357 
358 	sv_consume_n(svp, ndigits + 3);
359 	return (B_TRUE);
360 }
361 
362 static boolean_t
363 rustleg_add_sep(rust_state_t *st)
364 {
365 	if (HAS_ERROR(st))
366 		return (B_FALSE);
367 
368 	return (rust_append(st, "::"));
369 }
370 
371 static boolean_t
372 rustleg_valid_sym(const strview_t *sv)
373 {
374 	size_t i;
375 
376 	for (i = 0; i < sv->sv_rem; i++) {
377 		char c = sv->sv_first[i];
378 
379 		if ((c & 0x80) == 0)
380 			continue;
381 		DEMDEBUG("%s: ERROR found 8-bit character '%c' in '%.*s' "
382 		    "at index %zu", __func__, c, SV_PRINT(sv), i);
383 		return (B_FALSE);
384 	}
385 	return (B_TRUE);
386 }
387