1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms of version 5 * 1.0 of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 12 /* 13 * Copyright 2019 Joyent, Inc. 14 * Copyright 2021 Jason King 15 */ 16 17 #include <inttypes.h> 18 #include <libcustr.h> 19 #include <limits.h> 20 #include <string.h> 21 #include <sys/byteorder.h> 22 #include "rust.h" 23 #include "strview.h" 24 25 /* 26 * The rust v0 encoding (rust RFC 2603) uses a slightly modified 27 * version of punycode to encode characters that are not ASCII. 28 * The big difference is that '_' is used to separate the ASCII codepoints 29 * from the non-ASCII code points instead of '-'. 30 * 31 * The decoding is taken almost directly from (IETF) RFC 3492 32 */ 33 34 #define BASE 36 35 #define TMIN 1 36 #define TMAX 26 37 #define SKEW 38 38 #define DAMP 700 39 #define INITIAL_BIAS 72 40 #define INITIAL_N 0x80 41 #define DELIMITER '_' 42 43 static inline uint32_t char_val(char); 44 45 static size_t 46 rustv0_puny_adapt(size_t delta, size_t npoints, boolean_t first) 47 { 48 size_t k = 0; 49 50 delta = first ? delta / DAMP : delta / 2; 51 delta += delta / npoints; 52 while (delta > ((BASE - TMIN) * TMAX) / 2) { 53 delta /= (BASE - TMIN); 54 k += BASE; 55 } 56 57 return (k + (((BASE - TMIN + 1) * delta) / (delta + SKEW))); 58 } 59 60 boolean_t 61 rustv0_puny_decode(rust_state_t *restrict st, strview_t *restrict src, 62 boolean_t repl_underscore) 63 { 64 uint32_t *buf; 65 size_t bufalloc; /* in units of uint32_t */ 66 size_t buflen; 67 size_t nbasic; 68 size_t i, old_i, k, w; 69 size_t n = INITIAL_N; 70 size_t bias = INITIAL_BIAS; 71 size_t delim_idx = 0; 72 boolean_t ret = B_FALSE; 73 char c; 74 75 DEMDEBUG("%s: str='%.*s'", __func__, SV_PRINT(src)); 76 77 /* 78 * The decoded string should never contain more codepoints than 79 * the original string, so creating a temporary buffer large 80 * enought to hold sv_remaining(src) uint32_t's should be 81 * large enough. 82 * 83 * This also serves as a size check -- xcalloc will fail if the 84 * resulting size of the buf (sizeof (uint32_t) * bufalloc) >= 85 * SIZE_MAX. If xcalloc succeeds, we therefore know that that 86 * buflen cannot overflow. 87 */ 88 buflen = 0; 89 bufalloc = sv_remaining(src) + 1; 90 buf = xcalloc(st->rs_ops, bufalloc, sizeof (uint32_t)); 91 if (buf == NULL) { 92 SET_ERROR(st); 93 return (B_FALSE); 94 } 95 96 /* 97 * Find the position of the last delimiter (if any). 98 * IETF RFC 3492 3.1 states that the delimiter is present if and only 99 * if there are a non-zero number of basic (ASCII) code points. Since 100 * the delimiter itself is a basic code point, the last one present 101 * in the original string is the actual delimiter between the basic 102 * and non-basic code points. Earlier occurences of the delimiter 103 * are treated as normal basic code points. For plain punycode, an 104 * all ASCII string encoded with punycode would terminate with a 105 * final delimiter, and a name with all non-basic code points would 106 * not have a delimiter at all. With the rust v0 encoding, punycode 107 * encoded identifiers have a 'u' prefix prior to the identifier 108 * length (['u'] <decimal-number> <bytes>), so we should never 109 * encounter an all ASCII name that's encoded with punycode (we error 110 * on this). For an all non-basic codepoint identifier, no delimiter 111 * will be present, and we treat that the same as the delimiter being 112 * in the first position of the string, and consume it (if present) 113 * when we transition from copying the basic code points (which there 114 * will be none in this situation) to non-basic code points. 115 */ 116 for (i = 0; i < src->sv_rem; i++) { 117 if (src->sv_first[i] == DELIMITER) { 118 delim_idx = i; 119 } 120 } 121 VERIFY3U(delim_idx, <, bufalloc); 122 123 if (delim_idx + 1 == sv_remaining(src)) { 124 DEMDEBUG("%s: encountered an all-ASCII name encoded with " 125 "punycode", __func__); 126 goto done; 127 } 128 129 /* Copy all the basic characters up to the delimiter into buf */ 130 for (nbasic = 0; nbasic < delim_idx; nbasic++) { 131 c = sv_consume_c(src); 132 133 /* The rust prefix check should guarantee this */ 134 VERIFY3U(c, <, 0x80); 135 136 /* 137 * Normal rust identifiers do not contain '-' in them. 138 * However ABI identifiers could contain a dash. Those 139 * are translated to _, and we need to replace accordingly 140 * when asked. 141 */ 142 if (repl_underscore && c == '_') 143 c = '-'; 144 145 buf[nbasic] = c; 146 buflen++; 147 } 148 DEMDEBUG("%s: %" PRIu32 " ASCII codepoints copied", __func__, nbasic); 149 150 /* 151 * Consume delimiter between basic and non-basic code points if present. 152 * See above for explanation why it may not be present. 153 */ 154 (void) sv_consume_if_c(src, DELIMITER); 155 156 DEMDEBUG("%s: non-ASCII codepoints to decode: %.*s", __func__, 157 SV_PRINT(src)); 158 159 for (i = 0; sv_remaining(src) > 0; i++) { 160 VERIFY3U(i, <=, buflen); 161 162 /* 163 * Guarantee we have enough space to insert another codepoint. 164 * Our buffer sizing above should prevent this from ever 165 * tripping, but check this out of paranoia. 166 */ 167 VERIFY3U(buflen, <, bufalloc - 1); 168 169 /* decode the next codepoint */ 170 for (old_i = i, k = BASE, w = 1; ; k += BASE) { 171 size_t t; 172 uint32_t digit; 173 174 if (sv_remaining(src) == 0) 175 goto done; 176 177 digit = char_val(sv_consume_c(src)); 178 if (digit >= BASE) 179 goto done; 180 181 i = i + digit * w; 182 183 if (k <= bias) 184 t = TMIN; 185 else if (k >= bias + TMAX) 186 t = TMAX; 187 else 188 t = k - bias; 189 190 if (digit < t) 191 break; 192 193 w = w * (BASE - t); 194 } 195 buflen++; 196 197 bias = rustv0_puny_adapt(i - old_i, buflen, 198 (old_i == 0) ? B_TRUE : B_FALSE); 199 n = n + i / buflen; 200 i = i % buflen; 201 202 DEMDEBUG("%s: insert \\u%04" PRIx32 " at index %zu (len = %zu)", 203 __func__, n, i, buflen); 204 205 /* 206 * At the start of this while loop, we guaranteed 207 * buflen < bufalloc - 1. Therefore we know there is room 208 * to move over the contents of buf at i to make room 209 * for the codepoint. We also just guaranteed that i 210 * is in the range [0, buflen), so this should always be 211 * safe. 212 */ 213 (void) memmove(buf + i + 1, buf + i, 214 (buflen - i) * sizeof (uint32_t)); 215 216 #if _LP64 217 /* 218 * This is always false for ILP32 and smatch will also complain, 219 * so we just omit it for ILP32. 220 */ 221 if (n > UINT32_MAX) { 222 DEMDEBUG("%s: ERROR: utf8 value is out of range", 223 __func__); 224 goto done; 225 } 226 #endif 227 228 buf[i] = (uint32_t)n; 229 } 230 231 DEMDEBUG("%s: inserted %zu non-basic code points", __func__, 232 buflen - nbasic); 233 234 for (i = 0; i < buflen; i++) { 235 if (!rust_append_utf8_c(st, buf[i])) 236 goto done; 237 } 238 ret = B_TRUE; 239 240 done: 241 xfree(st->rs_ops, buf, bufalloc * sizeof (uint32_t)); 242 return (ret); 243 } 244 245 /* 246 * Convert [0-9][a-z] to a value [0..35]. Rust's punycode encoding always 247 * uses lowercase, so we treat uppercase (and any other characters) as 248 * invalid, and return BASE (36) to indicate a bad value. 249 */ 250 static inline uint32_t 251 char_val(char c) 252 { 253 uint32_t v = c; 254 255 if (ISLOWER(c)) { 256 return (c - 'a'); 257 } else if (ISDIGIT(c)) { 258 return (c - '0' + 26); 259 } else { 260 DEMDEBUG("%s: ERROR: invalid character 0x%02x encountered", 261 __func__, v); 262 return (BASE); 263 } 264 } 265