1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11
12 /*
13 * Copyright 2019 Joyent, Inc.
14 * Copyright 2021 Jason King
15 */
16
17 #include <inttypes.h>
18 #include <libcustr.h>
19 #include <limits.h>
20 #include <string.h>
21 #include <sys/byteorder.h>
22 #include "rust.h"
23 #include "strview.h"
24
25 /*
26 * The rust v0 encoding (rust RFC 2603) uses a slightly modified
27 * version of punycode to encode characters that are not ASCII.
28 * The big difference is that '_' is used to separate the ASCII codepoints
29 * from the non-ASCII code points instead of '-'.
30 *
31 * The decoding is taken almost directly from (IETF) RFC 3492
32 */
33
34 #define BASE 36
35 #define TMIN 1
36 #define TMAX 26
37 #define SKEW 38
38 #define DAMP 700
39 #define INITIAL_BIAS 72
40 #define INITIAL_N 0x80
41 #define DELIMITER '_'
42
43 static inline uint32_t char_val(char);
44
45 static size_t
rustv0_puny_adapt(size_t delta,size_t npoints,boolean_t first)46 rustv0_puny_adapt(size_t delta, size_t npoints, boolean_t first)
47 {
48 size_t k = 0;
49
50 delta = first ? delta / DAMP : delta / 2;
51 delta += delta / npoints;
52 while (delta > ((BASE - TMIN) * TMAX) / 2) {
53 delta /= (BASE - TMIN);
54 k += BASE;
55 }
56
57 return (k + (((BASE - TMIN + 1) * delta) / (delta + SKEW)));
58 }
59
60 boolean_t
rustv0_puny_decode(rust_state_t * restrict st,strview_t * restrict src,boolean_t repl_underscore)61 rustv0_puny_decode(rust_state_t *restrict st, strview_t *restrict src,
62 boolean_t repl_underscore)
63 {
64 uint32_t *buf;
65 size_t bufalloc; /* in units of uint32_t */
66 size_t buflen;
67 size_t nbasic;
68 size_t i, old_i, k, w;
69 size_t n = INITIAL_N;
70 size_t bias = INITIAL_BIAS;
71 size_t delim_idx = 0;
72 boolean_t ret = B_FALSE;
73 char c;
74
75 DEMDEBUG("%s: str='%.*s'", __func__, SV_PRINT(src));
76
77 /*
78 * The decoded string should never contain more codepoints than
79 * the original string, so creating a temporary buffer large
80 * enought to hold sv_remaining(src) uint32_t's should be
81 * large enough.
82 *
83 * This also serves as a size check -- xcalloc will fail if the
84 * resulting size of the buf (sizeof (uint32_t) * bufalloc) >=
85 * SIZE_MAX. If xcalloc succeeds, we therefore know that that
86 * buflen cannot overflow.
87 */
88 buflen = 0;
89 bufalloc = sv_remaining(src) + 1;
90 buf = xcalloc(st->rs_ops, bufalloc, sizeof (uint32_t));
91 if (buf == NULL) {
92 SET_ERROR(st);
93 return (B_FALSE);
94 }
95
96 /*
97 * Find the position of the last delimiter (if any).
98 * IETF RFC 3492 3.1 states that the delimiter is present if and only
99 * if there are a non-zero number of basic (ASCII) code points. Since
100 * the delimiter itself is a basic code point, the last one present
101 * in the original string is the actual delimiter between the basic
102 * and non-basic code points. Earlier occurences of the delimiter
103 * are treated as normal basic code points. For plain punycode, an
104 * all ASCII string encoded with punycode would terminate with a
105 * final delimiter, and a name with all non-basic code points would
106 * not have a delimiter at all. With the rust v0 encoding, punycode
107 * encoded identifiers have a 'u' prefix prior to the identifier
108 * length (['u'] <decimal-number> <bytes>), so we should never
109 * encounter an all ASCII name that's encoded with punycode (we error
110 * on this). For an all non-basic codepoint identifier, no delimiter
111 * will be present, and we treat that the same as the delimiter being
112 * in the first position of the string, and consume it (if present)
113 * when we transition from copying the basic code points (which there
114 * will be none in this situation) to non-basic code points.
115 */
116 for (i = 0; i < src->sv_rem; i++) {
117 if (src->sv_first[i] == DELIMITER) {
118 delim_idx = i;
119 }
120 }
121 VERIFY3U(delim_idx, <, bufalloc);
122
123 if (delim_idx + 1 == sv_remaining(src)) {
124 DEMDEBUG("%s: encountered an all-ASCII name encoded with "
125 "punycode", __func__);
126 goto done;
127 }
128
129 /* Copy all the basic characters up to the delimiter into buf */
130 for (nbasic = 0; nbasic < delim_idx; nbasic++) {
131 c = sv_consume_c(src);
132
133 /* The rust prefix check should guarantee this */
134 VERIFY3U(c, <, 0x80);
135
136 /*
137 * Normal rust identifiers do not contain '-' in them.
138 * However ABI identifiers could contain a dash. Those
139 * are translated to _, and we need to replace accordingly
140 * when asked.
141 */
142 if (repl_underscore && c == '_')
143 c = '-';
144
145 buf[nbasic] = c;
146 buflen++;
147 }
148 DEMDEBUG("%s: %" PRIu32 " ASCII codepoints copied", __func__, nbasic);
149
150 /*
151 * Consume delimiter between basic and non-basic code points if present.
152 * See above for explanation why it may not be present.
153 */
154 (void) sv_consume_if_c(src, DELIMITER);
155
156 DEMDEBUG("%s: non-ASCII codepoints to decode: %.*s", __func__,
157 SV_PRINT(src));
158
159 for (i = 0; sv_remaining(src) > 0; i++) {
160 VERIFY3U(i, <=, buflen);
161
162 /*
163 * Guarantee we have enough space to insert another codepoint.
164 * Our buffer sizing above should prevent this from ever
165 * tripping, but check this out of paranoia.
166 */
167 VERIFY3U(buflen, <, bufalloc - 1);
168
169 /* decode the next codepoint */
170 for (old_i = i, k = BASE, w = 1; ; k += BASE) {
171 size_t t;
172 uint32_t digit;
173
174 if (sv_remaining(src) == 0)
175 goto done;
176
177 digit = char_val(sv_consume_c(src));
178 if (digit >= BASE)
179 goto done;
180
181 i = i + digit * w;
182
183 if (k <= bias)
184 t = TMIN;
185 else if (k >= bias + TMAX)
186 t = TMAX;
187 else
188 t = k - bias;
189
190 if (digit < t)
191 break;
192
193 w = w * (BASE - t);
194 }
195 buflen++;
196
197 bias = rustv0_puny_adapt(i - old_i, buflen,
198 (old_i == 0) ? B_TRUE : B_FALSE);
199 n = n + i / buflen;
200 i = i % buflen;
201
202 DEMDEBUG("%s: insert \\u%04" PRIx32 " at index %zu (len = %zu)",
203 __func__, n, i, buflen);
204
205 /*
206 * At the start of this while loop, we guaranteed
207 * buflen < bufalloc - 1. Therefore we know there is room
208 * to move over the contents of buf at i to make room
209 * for the codepoint. We also just guaranteed that i
210 * is in the range [0, buflen), so this should always be
211 * safe.
212 */
213 (void) memmove(buf + i + 1, buf + i,
214 (buflen - i) * sizeof (uint32_t));
215
216 #if _LP64
217 /*
218 * This is always false for ILP32 and smatch will also complain,
219 * so we just omit it for ILP32.
220 */
221 if (n > UINT32_MAX) {
222 DEMDEBUG("%s: ERROR: utf8 value is out of range",
223 __func__);
224 goto done;
225 }
226 #endif
227
228 buf[i] = (uint32_t)n;
229 }
230
231 DEMDEBUG("%s: inserted %zu non-basic code points", __func__,
232 buflen - nbasic);
233
234 for (i = 0; i < buflen; i++) {
235 if (!rust_append_utf8_c(st, buf[i]))
236 goto done;
237 }
238 ret = B_TRUE;
239
240 done:
241 xfree(st->rs_ops, buf, bufalloc * sizeof (uint32_t));
242 return (ret);
243 }
244
245 /*
246 * Convert [0-9][a-z] to a value [0..35]. Rust's punycode encoding always
247 * uses lowercase, so we treat uppercase (and any other characters) as
248 * invalid, and return BASE (36) to indicate a bad value.
249 */
250 static inline uint32_t
char_val(char c)251 char_val(char c)
252 {
253 uint32_t v = c;
254
255 if (ISLOWER(c)) {
256 return (c - 'a');
257 } else if (ISDIGIT(c)) {
258 return (c - '0' + 26);
259 } else {
260 DEMDEBUG("%s: ERROR: invalid character 0x%02x encountered",
261 __func__, v);
262 return (BASE);
263 }
264 }
265