1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * Some of the source code in this file came from fs/cifs/cifs_unicode.c 4 * 5 * Copyright (c) International Business Machines Corp., 2000,2009 6 * Modified by Steve French (sfrench@us.ibm.com) 7 * Modified by Namjae Jeon (linkinjeon@kernel.org) 8 */ 9 #include <linux/fs.h> 10 #include <linux/slab.h> 11 #include <linux/unaligned.h> 12 #include "glob.h" 13 #include "unicode.h" 14 #include "smb_common.h" 15 16 /* 17 * cifs_mapchar() - convert a host-endian char to proper char in codepage 18 * @target: where converted character should be copied 19 * @from: host-endian source string 20 * @cp: codepage to which character should be converted 21 * @mapchar: should character be mapped according to mapchars mount option? 22 * 23 * This function handles the conversion of a single character. It is the 24 * responsibility of the caller to ensure that the target buffer is large 25 * enough to hold the result of the conversion (at least NLS_MAX_CHARSET_SIZE). 26 * 27 * Return: string length after conversion 28 */ 29 static int 30 cifs_mapchar(char *target, const __u16 *from, const struct nls_table *cp, 31 bool mapchar) 32 { 33 int len = 1; 34 __u16 src_char; 35 36 src_char = *from; 37 38 if (!mapchar) 39 goto cp_convert; 40 41 /* 42 * BB: Cannot handle remapping UNI_SLASH until all the calls to 43 * build_path_from_dentry are modified, as they use slash as 44 * separator. 45 */ 46 switch (src_char) { 47 case UNI_COLON: 48 *target = ':'; 49 break; 50 case UNI_ASTERISK: 51 *target = '*'; 52 break; 53 case UNI_QUESTION: 54 *target = '?'; 55 break; 56 case UNI_PIPE: 57 *target = '|'; 58 break; 59 case UNI_GRTRTHAN: 60 *target = '>'; 61 break; 62 case UNI_LESSTHAN: 63 *target = '<'; 64 break; 65 default: 66 goto cp_convert; 67 } 68 69 out: 70 return len; 71 72 cp_convert: 73 len = cp->uni2char(src_char, target, NLS_MAX_CHARSET_SIZE); 74 if (len <= 0) 75 goto surrogate_pair; 76 77 goto out; 78 79 surrogate_pair: 80 /* convert SURROGATE_PAIR and IVS */ 81 if (strcmp(cp->charset, "utf8")) 82 goto unknown; 83 len = utf16s_to_utf8s(from, 3, UTF16_LITTLE_ENDIAN, target, 6); 84 if (len <= 0) 85 goto unknown; 86 return len; 87 88 unknown: 89 *target = '?'; 90 len = 1; 91 goto out; 92 } 93 94 /* 95 * smb_utf16_bytes() - compute converted string length 96 * @from: pointer to input string 97 * @maxbytes: input string length 98 * @codepage: destination codepage 99 * 100 * Walk a utf16le string and return the number of bytes that the string will 101 * be after being converted to the given charset, not including any null 102 * termination required. Don't walk past maxbytes in the source buffer. 103 * 104 * Return: string length after conversion 105 */ 106 static int smb_utf16_bytes(const __le16 *from, int maxbytes, 107 const struct nls_table *codepage) 108 { 109 int i, j; 110 int charlen, outlen = 0; 111 int maxwords = maxbytes / 2; 112 char tmp[NLS_MAX_CHARSET_SIZE]; 113 __u16 ftmp[3]; 114 115 for (i = 0; i < maxwords; i++) { 116 ftmp[0] = get_unaligned_le16(&from[i]); 117 if (ftmp[0] == 0) 118 break; 119 for (j = 1; j <= 2; j++) { 120 if (i + j < maxwords) 121 ftmp[j] = get_unaligned_le16(&from[i + j]); 122 else 123 ftmp[j] = 0; 124 } 125 126 charlen = cifs_mapchar(tmp, ftmp, codepage, 0); 127 if (charlen > 0) 128 outlen += charlen; 129 else 130 outlen++; 131 } 132 133 return outlen; 134 } 135 136 /* 137 * smb_from_utf16() - convert utf16le string to local charset 138 * @to: destination buffer 139 * @from: source buffer 140 * @tolen: destination buffer size (in bytes) 141 * @fromlen: source buffer size (in bytes) 142 * @codepage: codepage to which characters should be converted 143 * @mapchar: should characters be remapped according to the mapchars option? 144 * 145 * Convert a little-endian utf16le string (as sent by the server) to a string 146 * in the provided codepage. The tolen and fromlen parameters are to ensure 147 * that the code doesn't walk off of the end of the buffer (which is always 148 * a danger if the alignment of the source buffer is off). The destination 149 * string is always properly null terminated and fits in the destination 150 * buffer. Returns the length of the destination string in bytes (including 151 * null terminator). 152 * 153 * Note that some windows versions actually send multiword UTF-16 characters 154 * instead of straight UTF16-2. The linux nls routines however aren't able to 155 * deal with those characters properly. In the event that we get some of 156 * those characters, they won't be translated properly. 157 * 158 * Return: string length after conversion 159 */ 160 static int smb_from_utf16(char *to, const __le16 *from, int tolen, int fromlen, 161 const struct nls_table *codepage, bool mapchar) 162 { 163 int i, j, charlen, safelen; 164 int outlen = 0; 165 int nullsize = nls_nullsize(codepage); 166 int fromwords = fromlen / 2; 167 char tmp[NLS_MAX_CHARSET_SIZE]; 168 __u16 ftmp[3]; /* ftmp[3] = 3array x 2bytes = 6bytes UTF-16 */ 169 170 /* 171 * because the chars can be of varying widths, we need to take care 172 * not to overflow the destination buffer when we get close to the 173 * end of it. Until we get to this offset, we don't need to check 174 * for overflow however. 175 */ 176 safelen = tolen - (NLS_MAX_CHARSET_SIZE + nullsize); 177 178 for (i = 0; i < fromwords; i++) { 179 ftmp[0] = get_unaligned_le16(&from[i]); 180 if (ftmp[0] == 0) 181 break; 182 for (j = 1; j <= 2; j++) { 183 if (i + j < fromwords) 184 ftmp[j] = get_unaligned_le16(&from[i + j]); 185 else 186 ftmp[j] = 0; 187 } 188 189 /* 190 * check to see if converting this character might make the 191 * conversion bleed into the null terminator 192 */ 193 if (outlen >= safelen) { 194 charlen = cifs_mapchar(tmp, ftmp, codepage, mapchar); 195 if ((outlen + charlen) > (tolen - nullsize)) 196 break; 197 } 198 199 /* put converted char into 'to' buffer */ 200 charlen = cifs_mapchar(&to[outlen], ftmp, codepage, mapchar); 201 outlen += charlen; 202 203 /* 204 * charlen (=bytes of UTF-8 for 1 character) 205 * 4bytes UTF-8(surrogate pair) is charlen=4 206 * (4bytes UTF-16 code) 207 * 7-8bytes UTF-8(IVS) is charlen=3+4 or 4+4 208 * (2 UTF-8 pairs divided to 2 UTF-16 pairs) 209 */ 210 if (charlen == 4) 211 i++; 212 else if (charlen >= 5) 213 /* 5-6bytes UTF-8 */ 214 i += 2; 215 } 216 217 /* properly null-terminate string */ 218 for (i = 0; i < nullsize; i++) 219 to[outlen++] = 0; 220 221 return outlen; 222 } 223 224 /* 225 * smb_strtoUTF16() - Convert character string to unicode string 226 * @to: destination buffer 227 * @from: source buffer 228 * @len: destination buffer size (in bytes) 229 * @codepage: codepage to which characters should be converted 230 * 231 * Return: string length after conversion 232 */ 233 int smb_strtoUTF16(__le16 *to, const char *from, int len, 234 const struct nls_table *codepage) 235 { 236 int charlen; 237 int i; 238 wchar_t wchar_to; /* needed to quiet sparse */ 239 240 /* special case for utf8 to handle no plane0 chars */ 241 if (!strcmp(codepage->charset, "utf8")) { 242 /* 243 * convert utf8 -> utf16, we assume we have enough space 244 * as caller should have assumed conversion does not overflow 245 * in destination len is length in wchar_t units (16bits) 246 */ 247 i = utf8s_to_utf16s(from, len, UTF16_LITTLE_ENDIAN, 248 (wchar_t *)to, len); 249 250 /* if success terminate and exit */ 251 if (i >= 0) 252 goto success; 253 /* 254 * if fails fall back to UCS encoding as this 255 * function should not return negative values 256 * currently can fail only if source contains 257 * invalid encoded characters 258 */ 259 } 260 261 for (i = 0; len > 0 && *from; i++, from += charlen, len -= charlen) { 262 charlen = codepage->char2uni(from, len, &wchar_to); 263 if (charlen < 1) { 264 /* A question mark */ 265 wchar_to = 0x003f; 266 charlen = 1; 267 } 268 put_unaligned_le16(wchar_to, &to[i]); 269 } 270 271 success: 272 put_unaligned_le16(0, &to[i]); 273 return i; 274 } 275 276 /* 277 * smb_strndup_from_utf16() - copy a string from wire format to the local 278 * codepage 279 * @src: source string 280 * @maxlen: don't walk past this many bytes in the source string 281 * @is_unicode: is this a unicode string? 282 * @codepage: destination codepage 283 * 284 * Take a string given by the server, convert it to the local codepage and 285 * put it in a new buffer. Returns a pointer to the new string or NULL on 286 * error. 287 * 288 * Return: destination string buffer or error ptr 289 */ 290 char *smb_strndup_from_utf16(const char *src, const int maxlen, 291 const bool is_unicode, 292 const struct nls_table *codepage) 293 { 294 int len, ret; 295 char *dst; 296 297 if (is_unicode) { 298 len = smb_utf16_bytes((__le16 *)src, maxlen, codepage); 299 len += nls_nullsize(codepage); 300 dst = kmalloc(len, GFP_KERNEL); 301 if (!dst) 302 return ERR_PTR(-ENOMEM); 303 ret = smb_from_utf16(dst, (__le16 *)src, len, maxlen, codepage, 304 false); 305 if (ret < 0) { 306 kfree(dst); 307 return ERR_PTR(-EINVAL); 308 } 309 } else { 310 len = strnlen(src, maxlen); 311 len++; 312 dst = kmalloc(len, GFP_KERNEL); 313 if (!dst) 314 return ERR_PTR(-ENOMEM); 315 strscpy(dst, src, len); 316 } 317 318 return dst; 319 } 320 321 /* 322 * Convert 16 bit Unicode pathname to wire format from string in current code 323 * page. Conversion may involve remapping up the six characters that are 324 * only legal in POSIX-like OS (if they are present in the string). Path 325 * names are little endian 16 bit Unicode on the wire 326 */ 327 /* 328 * smbConvertToUTF16() - convert string from local charset to utf16 329 * @target: destination buffer 330 * @source: source buffer 331 * @srclen: source buffer size (in bytes) 332 * @cp: codepage to which characters should be converted 333 * @mapchar: should characters be remapped according to the mapchars option? 334 * 335 * Convert 16 bit Unicode pathname to wire format from string in current code 336 * page. Conversion may involve remapping up the six characters that are 337 * only legal in POSIX-like OS (if they are present in the string). Path 338 * names are little endian 16 bit Unicode on the wire 339 * 340 * Return: char length after conversion 341 */ 342 int smbConvertToUTF16(__le16 *target, const char *source, int srclen, 343 const struct nls_table *cp, int mapchars) 344 { 345 int i, j, charlen; 346 char src_char; 347 __le16 dst_char; 348 wchar_t tmp; 349 wchar_t wchar_to[6]; /* UTF-16 */ 350 int ret; 351 unicode_t u; 352 353 if (!mapchars) 354 return smb_strtoUTF16(target, source, srclen, cp); 355 356 for (i = 0, j = 0; i < srclen; j++) { 357 src_char = source[i]; 358 charlen = 1; 359 switch (src_char) { 360 case 0: 361 put_unaligned(0, &target[j]); 362 return j; 363 case ':': 364 dst_char = cpu_to_le16(UNI_COLON); 365 break; 366 case '*': 367 dst_char = cpu_to_le16(UNI_ASTERISK); 368 break; 369 case '?': 370 dst_char = cpu_to_le16(UNI_QUESTION); 371 break; 372 case '<': 373 dst_char = cpu_to_le16(UNI_LESSTHAN); 374 break; 375 case '>': 376 dst_char = cpu_to_le16(UNI_GRTRTHAN); 377 break; 378 case '|': 379 dst_char = cpu_to_le16(UNI_PIPE); 380 break; 381 /* 382 * FIXME: We can not handle remapping backslash (UNI_SLASH) 383 * until all the calls to build_path_from_dentry are modified, 384 * as they use backslash as separator. 385 */ 386 default: 387 charlen = cp->char2uni(source + i, srclen - i, &tmp); 388 dst_char = cpu_to_le16(tmp); 389 390 /* 391 * if no match, use question mark, which at least in 392 * some cases serves as wild card 393 */ 394 if (charlen > 0) 395 goto ctoUTF16; 396 397 /* convert SURROGATE_PAIR */ 398 if (strcmp(cp->charset, "utf8")) 399 goto unknown; 400 if (*(source + i) & 0x80) { 401 charlen = utf8_to_utf32(source + i, 6, &u); 402 if (charlen < 0) 403 goto unknown; 404 } else 405 goto unknown; 406 ret = utf8s_to_utf16s(source + i, charlen, 407 UTF16_LITTLE_ENDIAN, 408 wchar_to, 6); 409 if (ret < 0) 410 goto unknown; 411 412 i += charlen; 413 dst_char = cpu_to_le16(*wchar_to); 414 if (charlen <= 3) 415 /* 1-3bytes UTF-8 to 2bytes UTF-16 */ 416 put_unaligned(dst_char, &target[j]); 417 else if (charlen == 4) { 418 /* 419 * 4bytes UTF-8(surrogate pair) to 4bytes UTF-16 420 * 7-8bytes UTF-8(IVS) divided to 2 UTF-16 421 * (charlen=3+4 or 4+4) 422 */ 423 put_unaligned(dst_char, &target[j]); 424 dst_char = cpu_to_le16(*(wchar_to + 1)); 425 j++; 426 put_unaligned(dst_char, &target[j]); 427 } else if (charlen >= 5) { 428 /* 5-6bytes UTF-8 to 6bytes UTF-16 */ 429 put_unaligned(dst_char, &target[j]); 430 dst_char = cpu_to_le16(*(wchar_to + 1)); 431 j++; 432 put_unaligned(dst_char, &target[j]); 433 dst_char = cpu_to_le16(*(wchar_to + 2)); 434 j++; 435 put_unaligned(dst_char, &target[j]); 436 } 437 continue; 438 439 unknown: 440 dst_char = cpu_to_le16(0x003f); 441 charlen = 1; 442 } 443 444 ctoUTF16: 445 /* 446 * character may take more than one byte in the source string, 447 * but will take exactly two bytes in the target string 448 */ 449 i += charlen; 450 put_unaligned(dst_char, &target[j]); 451 } 452 453 return j; 454 } 455