1 /* 2 * unicode.c 3 * 4 * PURPOSE 5 * Routines for converting between UTF-8 and OSTA Compressed Unicode. 6 * Also handles filename mangling 7 * 8 * DESCRIPTION 9 * OSTA Compressed Unicode is explained in the OSTA UDF specification. 10 * http://www.osta.org/ 11 * UTF-8 is explained in the IETF RFC XXXX. 12 * ftp://ftp.internic.net/rfc/rfcxxxx.txt 13 * 14 * COPYRIGHT 15 * This file is distributed under the terms of the GNU General Public 16 * License (GPL). Copies of the GPL can be obtained from: 17 * ftp://prep.ai.mit.edu/pub/gnu/GPL 18 * Each contributing author retains all rights to their own work. 19 */ 20 21 #include "udfdecl.h" 22 23 #include <linux/kernel.h> 24 #include <linux/string.h> /* for memset */ 25 #include <linux/nls.h> 26 #include <linux/crc-itu-t.h> 27 #include <linux/slab.h> 28 29 #include "udf_sb.h" 30 31 static int udf_uni2char_utf8(wchar_t uni, 32 unsigned char *out, 33 int boundlen) 34 { 35 int u_len = 0; 36 37 if (boundlen <= 0) 38 return -ENAMETOOLONG; 39 40 if (uni < 0x80) { 41 out[u_len++] = (unsigned char)uni; 42 } else if (uni < 0x800) { 43 if (boundlen < 2) 44 return -ENAMETOOLONG; 45 out[u_len++] = (unsigned char)(0xc0 | (uni >> 6)); 46 out[u_len++] = (unsigned char)(0x80 | (uni & 0x3f)); 47 } else { 48 if (boundlen < 3) 49 return -ENAMETOOLONG; 50 out[u_len++] = (unsigned char)(0xe0 | (uni >> 12)); 51 out[u_len++] = (unsigned char)(0x80 | ((uni >> 6) & 0x3f)); 52 out[u_len++] = (unsigned char)(0x80 | (uni & 0x3f)); 53 } 54 return u_len; 55 } 56 57 static int udf_char2uni_utf8(const unsigned char *in, 58 int boundlen, 59 wchar_t *uni) 60 { 61 unsigned int utf_char; 62 unsigned char c; 63 int utf_cnt, u_len; 64 65 utf_char = 0; 66 utf_cnt = 0; 67 for (u_len = 0; u_len < boundlen;) { 68 c = in[u_len++]; 69 70 /* Complete a multi-byte UTF-8 character */ 71 if (utf_cnt) { 72 utf_char = (utf_char << 6) | (c & 0x3f); 73 if (--utf_cnt) 74 continue; 75 } else { 76 /* Check for a multi-byte UTF-8 character */ 77 if (c & 0x80) { 78 /* Start a multi-byte UTF-8 character */ 79 if ((c & 0xe0) == 0xc0) { 80 utf_char = c & 0x1f; 81 utf_cnt = 1; 82 } else if ((c & 0xf0) == 0xe0) { 83 utf_char = c & 0x0f; 84 utf_cnt = 2; 85 } else if ((c & 0xf8) == 0xf0) { 86 utf_char = c & 0x07; 87 utf_cnt = 3; 88 } else if ((c & 0xfc) == 0xf8) { 89 utf_char = c & 0x03; 90 utf_cnt = 4; 91 } else if ((c & 0xfe) == 0xfc) { 92 utf_char = c & 0x01; 93 utf_cnt = 5; 94 } else { 95 utf_cnt = -1; 96 break; 97 } 98 continue; 99 } else { 100 /* Single byte UTF-8 character (most common) */ 101 utf_char = c; 102 } 103 } 104 *uni = utf_char; 105 break; 106 } 107 if (utf_cnt) { 108 *uni = '?'; 109 return -EINVAL; 110 } 111 return u_len; 112 } 113 114 #define ILLEGAL_CHAR_MARK '_' 115 #define EXT_MARK '.' 116 #define CRC_MARK '#' 117 #define EXT_SIZE 5 118 /* Number of chars we need to store generated CRC to make filename unique */ 119 #define CRC_LEN 5 120 121 static int udf_name_conv_char(uint8_t *str_o, int str_o_max_len, 122 int *str_o_idx, 123 const uint8_t *str_i, int str_i_max_len, 124 int *str_i_idx, 125 int u_ch, int *needsCRC, 126 int (*conv_f)(wchar_t, unsigned char *, int), 127 int translate) 128 { 129 uint32_t c; 130 int illChar = 0; 131 int len, gotch = 0; 132 133 for (; (!gotch) && (*str_i_idx < str_i_max_len); *str_i_idx += u_ch) { 134 if (*str_o_idx >= str_o_max_len) { 135 *needsCRC = 1; 136 return gotch; 137 } 138 139 /* Expand OSTA compressed Unicode to Unicode */ 140 c = str_i[*str_i_idx]; 141 if (u_ch > 1) 142 c = (c << 8) | str_i[*str_i_idx + 1]; 143 144 if (translate && (c == '/' || c == 0)) 145 illChar = 1; 146 else if (illChar) 147 break; 148 else 149 gotch = 1; 150 } 151 if (illChar) { 152 *needsCRC = 1; 153 c = ILLEGAL_CHAR_MARK; 154 gotch = 1; 155 } 156 if (gotch) { 157 len = conv_f(c, &str_o[*str_o_idx], str_o_max_len - *str_o_idx); 158 /* Valid character? */ 159 if (len >= 0) 160 *str_o_idx += len; 161 else if (len == -ENAMETOOLONG) { 162 *needsCRC = 1; 163 gotch = 0; 164 } else { 165 str_o[(*str_o_idx)++] = '?'; 166 *needsCRC = 1; 167 } 168 } 169 return gotch; 170 } 171 172 static int udf_name_from_CS0(uint8_t *str_o, int str_max_len, 173 const uint8_t *ocu, int ocu_len, 174 int (*conv_f)(wchar_t, unsigned char *, int), 175 int translate) 176 { 177 uint32_t c; 178 uint8_t cmp_id; 179 int idx, len; 180 int u_ch; 181 int needsCRC = 0; 182 int ext_i_len, ext_max_len; 183 int str_o_len = 0; /* Length of resulting output */ 184 int ext_o_len = 0; /* Extension output length */ 185 int ext_crc_len = 0; /* Extension output length if used with CRC */ 186 int i_ext = -1; /* Extension position in input buffer */ 187 int o_crc = 0; /* Rightmost possible output pos for CRC+ext */ 188 unsigned short valueCRC; 189 uint8_t ext[EXT_SIZE * NLS_MAX_CHARSET_SIZE + 1]; 190 uint8_t crc[CRC_LEN]; 191 192 if (str_max_len <= 0) 193 return 0; 194 195 if (ocu_len == 0) { 196 memset(str_o, 0, str_max_len); 197 return 0; 198 } 199 200 cmp_id = ocu[0]; 201 if (cmp_id != 8 && cmp_id != 16) { 202 memset(str_o, 0, str_max_len); 203 pr_err("unknown compression code (%d)\n", cmp_id); 204 return -EINVAL; 205 } 206 u_ch = cmp_id >> 3; 207 208 ocu++; 209 ocu_len--; 210 211 if (ocu_len % u_ch) { 212 pr_err("incorrect filename length (%d)\n", ocu_len + 1); 213 return -EINVAL; 214 } 215 216 if (translate) { 217 /* Look for extension */ 218 for (idx = ocu_len - u_ch, ext_i_len = 0; 219 (idx >= 0) && (ext_i_len < EXT_SIZE); 220 idx -= u_ch, ext_i_len++) { 221 c = ocu[idx]; 222 if (u_ch > 1) 223 c = (c << 8) | ocu[idx + 1]; 224 225 if (c == EXT_MARK) { 226 if (ext_i_len) 227 i_ext = idx; 228 break; 229 } 230 } 231 if (i_ext >= 0) { 232 /* Convert extension */ 233 ext_max_len = min_t(int, sizeof(ext), str_max_len); 234 ext[ext_o_len++] = EXT_MARK; 235 idx = i_ext + u_ch; 236 while (udf_name_conv_char(ext, ext_max_len, &ext_o_len, 237 ocu, ocu_len, &idx, 238 u_ch, &needsCRC, 239 conv_f, translate)) { 240 if ((ext_o_len + CRC_LEN) < str_max_len) 241 ext_crc_len = ext_o_len; 242 } 243 } 244 } 245 246 idx = 0; 247 while (1) { 248 if (translate && (idx == i_ext)) { 249 if (str_o_len > (str_max_len - ext_o_len)) 250 needsCRC = 1; 251 break; 252 } 253 254 if (!udf_name_conv_char(str_o, str_max_len, &str_o_len, 255 ocu, ocu_len, &idx, 256 u_ch, &needsCRC, conv_f, translate)) 257 break; 258 259 if (translate && 260 (str_o_len <= (str_max_len - ext_o_len - CRC_LEN))) 261 o_crc = str_o_len; 262 } 263 264 if (translate) { 265 if (str_o_len <= 2 && str_o[0] == '.' && 266 (str_o_len == 1 || str_o[1] == '.')) 267 needsCRC = 1; 268 if (needsCRC) { 269 str_o_len = o_crc; 270 valueCRC = crc_itu_t(0, ocu, ocu_len); 271 crc[0] = CRC_MARK; 272 crc[1] = hex_asc_upper_hi(valueCRC >> 8); 273 crc[2] = hex_asc_upper_lo(valueCRC >> 8); 274 crc[3] = hex_asc_upper_hi(valueCRC); 275 crc[4] = hex_asc_upper_lo(valueCRC); 276 len = min_t(int, CRC_LEN, str_max_len - str_o_len); 277 memcpy(&str_o[str_o_len], crc, len); 278 str_o_len += len; 279 ext_o_len = ext_crc_len; 280 } 281 if (ext_o_len > 0) { 282 memcpy(&str_o[str_o_len], ext, ext_o_len); 283 str_o_len += ext_o_len; 284 } 285 } 286 287 return str_o_len; 288 } 289 290 static int udf_name_to_CS0(uint8_t *ocu, int ocu_max_len, 291 const uint8_t *str_i, int str_len, 292 int (*conv_f)(const unsigned char *, int, wchar_t *)) 293 { 294 int i, len; 295 unsigned int max_val; 296 wchar_t uni_char; 297 int u_len, u_ch; 298 299 if (ocu_max_len <= 0) 300 return 0; 301 302 memset(ocu, 0, ocu_max_len); 303 ocu[0] = 8; 304 max_val = 0xff; 305 u_ch = 1; 306 307 try_again: 308 u_len = 1; 309 for (i = 0; i < str_len; i++) { 310 /* Name didn't fit? */ 311 if (u_len + u_ch > ocu_max_len) 312 return 0; 313 len = conv_f(&str_i[i], str_len - i, &uni_char); 314 if (!len) 315 continue; 316 /* Invalid character, deal with it */ 317 if (len < 0) { 318 len = 1; 319 uni_char = '?'; 320 } 321 322 if (uni_char > max_val) { 323 max_val = 0xffff; 324 ocu[0] = 0x10; 325 u_ch = 2; 326 goto try_again; 327 } 328 329 if (max_val == 0xffff) 330 ocu[u_len++] = (uint8_t)(uni_char >> 8); 331 ocu[u_len++] = (uint8_t)(uni_char & 0xff); 332 i += len - 1; 333 } 334 335 return u_len; 336 } 337 338 int udf_dstrCS0toUTF8(uint8_t *utf_o, int o_len, 339 const uint8_t *ocu_i, int i_len) 340 { 341 int s_len = 0; 342 343 if (i_len > 0) { 344 s_len = ocu_i[i_len - 1]; 345 if (s_len >= i_len) { 346 pr_err("incorrect dstring lengths (%d/%d)\n", 347 s_len, i_len); 348 return -EINVAL; 349 } 350 } 351 352 return udf_name_from_CS0(utf_o, o_len, ocu_i, s_len, 353 udf_uni2char_utf8, 0); 354 } 355 356 int udf_get_filename(struct super_block *sb, const uint8_t *sname, int slen, 357 uint8_t *dname, int dlen) 358 { 359 int (*conv_f)(wchar_t, unsigned char *, int); 360 int ret; 361 362 if (!slen) 363 return -EIO; 364 365 if (dlen <= 0) 366 return 0; 367 368 if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) { 369 conv_f = udf_uni2char_utf8; 370 } else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) { 371 conv_f = UDF_SB(sb)->s_nls_map->uni2char; 372 } else 373 BUG(); 374 375 ret = udf_name_from_CS0(dname, dlen, sname, slen, conv_f, 1); 376 /* Zero length filename isn't valid... */ 377 if (ret == 0) 378 ret = -EINVAL; 379 return ret; 380 } 381 382 int udf_put_filename(struct super_block *sb, const uint8_t *sname, int slen, 383 uint8_t *dname, int dlen) 384 { 385 int (*conv_f)(const unsigned char *, int, wchar_t *); 386 387 if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) { 388 conv_f = udf_char2uni_utf8; 389 } else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) { 390 conv_f = UDF_SB(sb)->s_nls_map->char2uni; 391 } else 392 BUG(); 393 394 return udf_name_to_CS0(dname, dlen, sname, slen, conv_f); 395 } 396 397