xref: /linux/tools/perf/util/demangle-rust.c (revision c8bfe3fad4f86a029da7157bae9699c816f0c309)
1 // SPDX-License-Identifier: GPL-2.0
2 #include <string.h>
3 #include "debug.h"
4 
5 #include "demangle-rust.h"
6 
7 /*
8  * Mangled Rust symbols look like this:
9  *
10  *     _$LT$std..sys..fd..FileDesc$u20$as$u20$core..ops..Drop$GT$::drop::hc68340e1baa4987a
11  *
12  * The original symbol is:
13  *
14  *     <std::sys::fd::FileDesc as core::ops::Drop>::drop
15  *
16  * The last component of the path is a 64-bit hash in lowercase hex, prefixed
17  * with "h". Rust does not have a global namespace between crates, an illusion
18  * which Rust maintains by using the hash to distinguish things that would
19  * otherwise have the same symbol.
20  *
21  * Any path component not starting with a XID_Start character is prefixed with
22  * "_".
23  *
24  * The following escape sequences are used:
25  *
26  *     ","  =>  $C$
27  *     "@"  =>  $SP$
28  *     "*"  =>  $BP$
29  *     "&"  =>  $RF$
30  *     "<"  =>  $LT$
31  *     ">"  =>  $GT$
32  *     "("  =>  $LP$
33  *     ")"  =>  $RP$
34  *     " "  =>  $u20$
35  *     "'"  =>  $u27$
36  *     "["  =>  $u5b$
37  *     "]"  =>  $u5d$
38  *     "~"  =>  $u7e$
39  *
40  * A double ".." means "::" and a single "." means "-".
41  *
42  * The only characters allowed in the mangled symbol are a-zA-Z0-9 and _.:$
43  */
44 
/*
 * Every mangled Rust symbol ends in the path separator "::" followed by "h"
 * and then hash_len hex digits.
 */
static const char hash_prefix[] = "::h";
/* Derived from the literal so the string and its length cannot drift apart. */
static const size_t hash_prefix_len = sizeof(hash_prefix) - 1;
/* Number of hex digits in the hash that follows hash_prefix. */
static const size_t hash_len = 16;
48 
/* Forward declarations of the file-local helpers defined further down. */
static bool is_prefixed_hash(const char *start);
static bool looks_like_rust(const char *sym, size_t len);
static bool unescape(const char **in, char **out, const char *seq, char value);
52 
53 /*
54  * INPUT:
55  *     sym: symbol that has been through BFD-demangling
56  *
57  * This function looks for the following indicators:
58  *
59  *  1. The hash must consist of "h" followed by 16 lowercase hex digits.
60  *
61  *  2. As a sanity check, the hash must use between 5 and 15 of the 16 possible
62  *     hex digits. This is true of 99.9998% of hashes so once in your life you
63  *     may see a false negative. The point is to notice path components that
64  *     could be Rust hashes but are probably not, like "haaaaaaaaaaaaaaaa". In
65  *     this case a false positive (non-Rust symbol has an important path
66  *     component removed because it looks like a Rust hash) is worse than a
67  *     false negative (the rare Rust symbol is not demangled) so this sets the
68  *     balance in favor of false negatives.
69  *
70  *  3. There must be no characters other than a-zA-Z0-9 and _.:$
71  *
72  *  4. There must be no unrecognized $-sign sequences.
73  *
74  *  5. There must be no sequence of three or more dots in a row ("...").
75  */
76 bool
77 rust_is_mangled(const char *sym)
78 {
79 	size_t len, len_without_hash;
80 
81 	if (!sym)
82 		return false;
83 
84 	len = strlen(sym);
85 	if (len <= hash_prefix_len + hash_len)
86 		/* Not long enough to contain "::h" + hash + something else */
87 		return false;
88 
89 	len_without_hash = len - (hash_prefix_len + hash_len);
90 	if (!is_prefixed_hash(sym + len_without_hash))
91 		return false;
92 
93 	return looks_like_rust(sym, len_without_hash);
94 }
95 
96 /*
97  * A hash is the prefix "::h" followed by 16 lowercase hex digits. The hex
98  * digits must comprise between 5 and 15 (inclusive) distinct digits.
99  */
100 static bool is_prefixed_hash(const char *str)
101 {
102 	const char *end;
103 	bool seen[16];
104 	size_t i;
105 	int count;
106 
107 	if (strncmp(str, hash_prefix, hash_prefix_len))
108 		return false;
109 	str += hash_prefix_len;
110 
111 	memset(seen, false, sizeof(seen));
112 	for (end = str + hash_len; str < end; str++)
113 		if (*str >= '0' && *str <= '9')
114 			seen[*str - '0'] = true;
115 		else if (*str >= 'a' && *str <= 'f')
116 			seen[*str - 'a' + 10] = true;
117 		else
118 			return false;
119 
120 	/* Count how many distinct digits seen */
121 	count = 0;
122 	for (i = 0; i < 16; i++)
123 		if (seen[i])
124 			count++;
125 
126 	return count >= 5 && count <= 15;
127 }
128 
/*
 * Check that the first len bytes of str contain only what the path part of a
 * mangled Rust symbol may contain:
 *
 *  - the characters a-zA-Z0-9 and _.:
 *  - the "$...$" escape sequences listed in the table below
 *  - no run of three or more consecutive dots
 *
 * Every check is confined to the first len bytes: an escape sequence or a run
 * of dots that would only be completed by bytes at or beyond str + len is not
 * taken into account, and the scan never advances past str + len.
 *
 * Returns true if the whole range passes, false at the first violation. An
 * empty range (len == 0) passes.
 */
static bool looks_like_rust(const char *str, size_t len)
{
	/* No entry is a prefix of another, so the lookup order is irrelevant */
	static const char *const escapes[] = {
		"$C$",
		"$SP$", "$BP$", "$RF$", "$LT$", "$GT$", "$LP$", "$RP$",
		"$u20$", "$u27$", "$u5b$", "$u5d$", "$u7e$",
	};
	const size_t n_escapes = sizeof(escapes) / sizeof(escapes[0]);
	const char *end = str + len;

	while (str < end) {
		const size_t left = (size_t)(end - str);
		size_t matched = 0;
		size_t i;

		switch (*str) {
		case '$':
			for (i = 0; i < n_escapes && !matched; i++) {
				const size_t elen = strlen(escapes[i]);

				/* Only accept escapes that fit entirely in range */
				if (elen <= left && !memcmp(str, escapes[i], elen))
					matched = elen;
			}
			if (!matched)
				return false;
			str += matched;
			break;
		case '.':
			/* Do not allow three or more consecutive dots */
			if (left >= 3 && str[1] == '.' && str[2] == '.')
				return false;
			/* Fall through */
		case 'a' ... 'z':
		case 'A' ... 'Z':
		case '0' ... '9':
		case '_':
		case ':':
			str++;
			break;
		default:
			return false;
		}
	}

	return true;
}
173 
174 /*
175  * INPUT:
176  *     sym: symbol for which rust_is_mangled(sym) returns true
177  *
178  * The input is demangled in-place because the mangled name is always longer
179  * than the demangled one.
180  */
181 void
182 rust_demangle_sym(char *sym)
183 {
184 	const char *in;
185 	char *out;
186 	const char *end;
187 
188 	if (!sym)
189 		return;
190 
191 	in = sym;
192 	out = sym;
193 	end = sym + strlen(sym) - (hash_prefix_len + hash_len);
194 
195 	while (in < end)
196 		switch (*in) {
197 		case '$':
198 			if (!(unescape(&in, &out, "$C$", ',')
199 					|| unescape(&in, &out, "$SP$", '@')
200 					|| unescape(&in, &out, "$BP$", '*')
201 					|| unescape(&in, &out, "$RF$", '&')
202 					|| unescape(&in, &out, "$LT$", '<')
203 					|| unescape(&in, &out, "$GT$", '>')
204 					|| unescape(&in, &out, "$LP$", '(')
205 					|| unescape(&in, &out, "$RP$", ')')
206 					|| unescape(&in, &out, "$u20$", ' ')
207 					|| unescape(&in, &out, "$u27$", '\'')
208 					|| unescape(&in, &out, "$u5b$", '[')
209 					|| unescape(&in, &out, "$u5d$", ']')
210 					|| unescape(&in, &out, "$u7e$", '~'))) {
211 				pr_err("demangle-rust: unexpected escape sequence");
212 				goto done;
213 			}
214 			break;
215 		case '_':
216 			/*
217 			 * If this is the start of a path component and the next
218 			 * character is an escape sequence, ignore the
219 			 * underscore. The mangler inserts an underscore to make
220 			 * sure the path component begins with a XID_Start
221 			 * character.
222 			 */
223 			if ((in == sym || in[-1] == ':') && in[1] == '$')
224 				in++;
225 			else
226 				*out++ = *in++;
227 			break;
228 		case '.':
229 			if (in[1] == '.') {
230 				/* ".." becomes "::" */
231 				*out++ = ':';
232 				*out++ = ':';
233 				in += 2;
234 			} else {
235 				/* "." becomes "-" */
236 				*out++ = '-';
237 				in++;
238 			}
239 			break;
240 		case 'a' ... 'z':
241 		case 'A' ... 'Z':
242 		case '0' ... '9':
243 		case ':':
244 			*out++ = *in++;
245 			break;
246 		default:
247 			pr_err("demangle-rust: unexpected character '%c' in symbol\n",
248 				*in);
249 			goto done;
250 		}
251 
252 done:
253 	*out = '\0';
254 }
255 
/*
 * Try to consume the escape sequence seq at the read cursor.
 *
 * in:    address of the read cursor; advanced past seq on a match
 * out:   address of the write cursor; receives value and advances by one on
 *        a match
 * seq:   NUL-terminated escape sequence to look for
 * value: character to emit in place of seq
 *
 * Returns true if the text at *in starts with seq. On a mismatch both cursors
 * and the output are left unchanged and false is returned.
 */
static bool unescape(const char **in, char **out, const char *seq, char value)
{
	const size_t seq_len = strlen(seq);
	const char *src = *in;
	char *dst = *out;

	if (strncmp(src, seq, seq_len) != 0)
		return false;

	*dst = value;

	*in = src + seq_len;
	*out = dst + 1;

	return true;
}
270