xref: /linux/fs/smb/client/cifs_unicode.c (revision 8a848efd482be65d488e888f96812d8729ea64ea)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  *
4  *   Copyright (c) International Business Machines  Corp., 2000,2009
5  *   Modified by Steve French (sfrench@us.ibm.com)
6  */
7 #include <linux/fs.h>
8 #include <linux/slab.h>
9 #include "cifs_fs_sb.h"
10 #include "cifs_unicode.h"
11 #include "cifsglob.h"
12 #include "cifs_debug.h"
13 
14 int cifs_remap(struct cifs_sb_info *cifs_sb)
15 {
16 	int map_type;
17 
18 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SFM_CHR)
19 		map_type = SFM_MAP_UNI_RSVD;
20 	else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR)
21 		map_type = SFU_MAP_UNI_RSVD;
22 	else
23 		map_type = NO_MAP_UNI_RSVD;
24 
25 	return map_type;
26 }
27 
28 /* Convert character using the SFU - "Services for Unix" remapping range */
29 static bool
30 convert_sfu_char(const __u16 src_char, char *target)
31 {
32 	/*
33 	 * BB: Cannot handle remapping UNI_SLASH until all the calls to
34 	 *     build_path_from_dentry are modified, as they use slash as
35 	 *     separator.
36 	 */
37 	switch (src_char) {
38 	case UNI_COLON:
39 		*target = ':';
40 		break;
41 	case UNI_ASTERISK:
42 		*target = '*';
43 		break;
44 	case UNI_QUESTION:
45 		*target = '?';
46 		break;
47 	case UNI_PIPE:
48 		*target = '|';
49 		break;
50 	case UNI_GRTRTHAN:
51 		*target = '>';
52 		break;
53 	case UNI_LESSTHAN:
54 		*target = '<';
55 		break;
56 	default:
57 		return false;
58 	}
59 	return true;
60 }
61 
62 /* Convert character using the SFM - "Services for Mac" remapping range */
63 static bool
64 convert_sfm_char(const __u16 src_char, char *target)
65 {
66 	if (src_char >= 0xF001 && src_char <= 0xF01F) {
67 		*target = src_char - 0xF000;
68 		return true;
69 	}
70 	switch (src_char) {
71 	case SFM_COLON:
72 		*target = ':';
73 		break;
74 	case SFM_DOUBLEQUOTE:
75 		*target = '"';
76 		break;
77 	case SFM_ASTERISK:
78 		*target = '*';
79 		break;
80 	case SFM_QUESTION:
81 		*target = '?';
82 		break;
83 	case SFM_PIPE:
84 		*target = '|';
85 		break;
86 	case SFM_GRTRTHAN:
87 		*target = '>';
88 		break;
89 	case SFM_LESSTHAN:
90 		*target = '<';
91 		break;
92 	case SFM_SPACE:
93 		*target = ' ';
94 		break;
95 	case SFM_PERIOD:
96 		*target = '.';
97 		break;
98 	default:
99 		return false;
100 	}
101 	return true;
102 }
103 
104 
105 /*
106  * cifs_mapchar - convert a host-endian char to proper char in codepage
107  * @target - where converted character should be copied
108  * @src_char - 2 byte host-endian source character
109  * @cp - codepage to which character should be converted
110  * @map_type - How should the 7 NTFS/SMB reserved characters be mapped to UCS2?
111  *
112  * This function handles the conversion of a single character. It is the
113  * responsibility of the caller to ensure that the target buffer is large
114  * enough to hold the result of the conversion (at least NLS_MAX_CHARSET_SIZE).
115  */
116 static int
117 cifs_mapchar(char *target, const __u16 *from, const struct nls_table *cp,
118 	     int maptype)
119 {
120 	int len = 1;
121 	__u16 src_char;
122 
123 	src_char = *from;
124 
125 	if ((maptype == SFM_MAP_UNI_RSVD) && convert_sfm_char(src_char, target))
126 		return len;
127 	else if ((maptype == SFU_MAP_UNI_RSVD) &&
128 		  convert_sfu_char(src_char, target))
129 		return len;
130 
131 	/* if character not one of seven in special remap set */
132 	len = cp->uni2char(src_char, target, NLS_MAX_CHARSET_SIZE);
133 	if (len <= 0)
134 		goto surrogate_pair;
135 
136 	return len;
137 
138 surrogate_pair:
139 	/* convert SURROGATE_PAIR and IVS */
140 	if (strcmp(cp->charset, "utf8"))
141 		goto unknown;
142 	len = utf16s_to_utf8s(from, 3, UTF16_LITTLE_ENDIAN, target, 6);
143 	if (len <= 0)
144 		goto unknown;
145 	return len;
146 
147 unknown:
148 	*target = '?';
149 	len = 1;
150 	return len;
151 }
152 
153 /*
154  * cifs_from_utf16 - convert utf16le string to local charset
155  * @to - destination buffer
156  * @from - source buffer
157  * @tolen - destination buffer size (in bytes)
158  * @fromlen - source buffer size (in bytes)
159  * @codepage - codepage to which characters should be converted
160  * @mapchar - should characters be remapped according to the mapchars option?
161  *
162  * Convert a little-endian utf16le string (as sent by the server) to a string
163  * in the provided codepage. The tolen and fromlen parameters are to ensure
164  * that the code doesn't walk off of the end of the buffer (which is always
165  * a danger if the alignment of the source buffer is off). The destination
166  * string is always properly null terminated and fits in the destination
167  * buffer. Returns the length of the destination string in bytes (including
168  * null terminator).
169  *
170  * Note that some windows versions actually send multiword UTF-16 characters
171  * instead of straight UTF16-2. The linux nls routines however aren't able to
172  * deal with those characters properly. In the event that we get some of
173  * those characters, they won't be translated properly.
174  */
175 int
176 cifs_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
177 		const struct nls_table *codepage, int map_type)
178 {
179 	int i, charlen, safelen;
180 	int outlen = 0;
181 	int nullsize = nls_nullsize(codepage);
182 	int fromwords = fromlen / 2;
183 	char tmp[NLS_MAX_CHARSET_SIZE];
184 	__u16 ftmp[3];		/* ftmp[3] = 3array x 2bytes = 6bytes UTF-16 */
185 
186 	/*
187 	 * because the chars can be of varying widths, we need to take care
188 	 * not to overflow the destination buffer when we get close to the
189 	 * end of it. Until we get to this offset, we don't need to check
190 	 * for overflow however.
191 	 */
192 	safelen = tolen - (NLS_MAX_CHARSET_SIZE + nullsize);
193 
194 	for (i = 0; i < fromwords; i++) {
195 		ftmp[0] = get_unaligned_le16(&from[i]);
196 		if (ftmp[0] == 0)
197 			break;
198 		if (i + 1 < fromwords)
199 			ftmp[1] = get_unaligned_le16(&from[i + 1]);
200 		else
201 			ftmp[1] = 0;
202 		if (i + 2 < fromwords)
203 			ftmp[2] = get_unaligned_le16(&from[i + 2]);
204 		else
205 			ftmp[2] = 0;
206 
207 		/*
208 		 * check to see if converting this character might make the
209 		 * conversion bleed into the null terminator
210 		 */
211 		if (outlen >= safelen) {
212 			charlen = cifs_mapchar(tmp, ftmp, codepage, map_type);
213 			if ((outlen + charlen) > (tolen - nullsize))
214 				break;
215 		}
216 
217 		/* put converted char into 'to' buffer */
218 		charlen = cifs_mapchar(&to[outlen], ftmp, codepage, map_type);
219 		outlen += charlen;
220 
221 		/* charlen (=bytes of UTF-8 for 1 character)
222 		 * 4bytes UTF-8(surrogate pair) is charlen=4
223 		 *   (4bytes UTF-16 code)
224 		 * 7-8bytes UTF-8(IVS) is charlen=3+4 or 4+4
225 		 *   (2 UTF-8 pairs divided to 2 UTF-16 pairs) */
226 		if (charlen == 4)
227 			i++;
228 		else if (charlen >= 5)
229 			/* 5-6bytes UTF-8 */
230 			i += 2;
231 	}
232 
233 	/* properly null-terminate string */
234 	for (i = 0; i < nullsize; i++)
235 		to[outlen++] = 0;
236 
237 	return outlen;
238 }
239 
240 /*
241  * NAME:	cifs_strtoUTF16()
242  *
243  * FUNCTION:	Convert character string to unicode string
244  *
245  */
246 int
247 cifs_strtoUTF16(__le16 *to, const char *from, int len,
248 	      const struct nls_table *codepage)
249 {
250 	int charlen;
251 	int i;
252 	wchar_t wchar_to; /* needed to quiet sparse */
253 
254 	/* special case for utf8 to handle no plane0 chars */
255 	if (!strcmp(codepage->charset, "utf8")) {
256 		/*
257 		 * convert utf8 -> utf16, we assume we have enough space
258 		 * as caller should have assumed conversion does not overflow
259 		 * in destination len is length in wchar_t units (16bits)
260 		 */
261 		i  = utf8s_to_utf16s(from, len, UTF16_LITTLE_ENDIAN,
262 				       (wchar_t *) to, len);
263 
264 		/* if success terminate and exit */
265 		if (i >= 0)
266 			goto success;
267 		/*
268 		 * if fails fall back to UCS encoding as this
269 		 * function should not return negative values
270 		 * currently can fail only if source contains
271 		 * invalid encoded characters
272 		 */
273 	}
274 
275 	for (i = 0; len && *from; i++, from += charlen, len -= charlen) {
276 		charlen = codepage->char2uni(from, len, &wchar_to);
277 		if (charlen < 1) {
278 			cifs_dbg(VFS, "strtoUTF16: char2uni of 0x%x returned %d\n",
279 				 *from, charlen);
280 			/* A question mark */
281 			wchar_to = 0x003f;
282 			charlen = 1;
283 		}
284 		put_unaligned_le16(wchar_to, &to[i]);
285 	}
286 
287 success:
288 	put_unaligned_le16(0, &to[i]);
289 	return i;
290 }
291 
292 /*
293  * cifs_utf16_bytes - how long will a string be after conversion?
294  * @utf16 - pointer to input string
295  * @maxbytes - don't go past this many bytes of input string
296  * @codepage - destination codepage
297  *
298  * Walk a utf16le string and return the number of bytes that the string will
299  * be after being converted to the given charset, not including any null
300  * termination required. Don't walk past maxbytes in the source buffer.
301  */
302 int
303 cifs_utf16_bytes(const __le16 *from, int maxbytes,
304 		const struct nls_table *codepage)
305 {
306 	int i;
307 	int charlen, outlen = 0;
308 	int maxwords = maxbytes / 2;
309 	char tmp[NLS_MAX_CHARSET_SIZE];
310 	__u16 ftmp[3];
311 
312 	for (i = 0; i < maxwords; i++) {
313 		ftmp[0] = get_unaligned_le16(&from[i]);
314 		if (ftmp[0] == 0)
315 			break;
316 		if (i + 1 < maxwords)
317 			ftmp[1] = get_unaligned_le16(&from[i + 1]);
318 		else
319 			ftmp[1] = 0;
320 		if (i + 2 < maxwords)
321 			ftmp[2] = get_unaligned_le16(&from[i + 2]);
322 		else
323 			ftmp[2] = 0;
324 
325 		charlen = cifs_mapchar(tmp, ftmp, codepage, NO_MAP_UNI_RSVD);
326 		outlen += charlen;
327 	}
328 
329 	return outlen;
330 }
331 
332 /*
333  * cifs_strndup_from_utf16 - copy a string from wire format to the local
334  * codepage
335  * @src - source string
336  * @maxlen - don't walk past this many bytes in the source string
337  * @is_unicode - is this a unicode string?
338  * @codepage - destination codepage
339  *
340  * Take a string given by the server, convert it to the local codepage and
341  * put it in a new buffer. Returns a pointer to the new string or NULL on
342  * error.
343  */
344 char *
345 cifs_strndup_from_utf16(const char *src, const int maxlen,
346 			const bool is_unicode, const struct nls_table *codepage)
347 {
348 	int len;
349 	char *dst;
350 
351 	if (is_unicode) {
352 		len = cifs_utf16_bytes((__le16 *) src, maxlen, codepage);
353 		len += nls_nullsize(codepage);
354 		dst = kmalloc(len, GFP_KERNEL);
355 		if (!dst)
356 			return NULL;
357 		cifs_from_utf16(dst, (__le16 *) src, len, maxlen, codepage,
358 				NO_MAP_UNI_RSVD);
359 	} else {
360 		dst = kstrndup(src, maxlen, GFP_KERNEL);
361 	}
362 
363 	return dst;
364 }
365 
366 static __le16 convert_to_sfu_char(char src_char)
367 {
368 	__le16 dest_char;
369 
370 	switch (src_char) {
371 	case ':':
372 		dest_char = cpu_to_le16(UNI_COLON);
373 		break;
374 	case '*':
375 		dest_char = cpu_to_le16(UNI_ASTERISK);
376 		break;
377 	case '?':
378 		dest_char = cpu_to_le16(UNI_QUESTION);
379 		break;
380 	case '<':
381 		dest_char = cpu_to_le16(UNI_LESSTHAN);
382 		break;
383 	case '>':
384 		dest_char = cpu_to_le16(UNI_GRTRTHAN);
385 		break;
386 	case '|':
387 		dest_char = cpu_to_le16(UNI_PIPE);
388 		break;
389 	default:
390 		dest_char = 0;
391 	}
392 
393 	return dest_char;
394 }
395 
396 static __le16 convert_to_sfm_char(char src_char, bool end_of_string)
397 {
398 	__le16 dest_char;
399 
400 	if (src_char >= 0x01 && src_char <= 0x1F) {
401 		dest_char = cpu_to_le16(src_char + 0xF000);
402 		return dest_char;
403 	}
404 	switch (src_char) {
405 	case ':':
406 		dest_char = cpu_to_le16(SFM_COLON);
407 		break;
408 	case '"':
409 		dest_char = cpu_to_le16(SFM_DOUBLEQUOTE);
410 		break;
411 	case '*':
412 		dest_char = cpu_to_le16(SFM_ASTERISK);
413 		break;
414 	case '?':
415 		dest_char = cpu_to_le16(SFM_QUESTION);
416 		break;
417 	case '<':
418 		dest_char = cpu_to_le16(SFM_LESSTHAN);
419 		break;
420 	case '>':
421 		dest_char = cpu_to_le16(SFM_GRTRTHAN);
422 		break;
423 	case '|':
424 		dest_char = cpu_to_le16(SFM_PIPE);
425 		break;
426 	case '.':
427 		if (end_of_string)
428 			dest_char = cpu_to_le16(SFM_PERIOD);
429 		else
430 			dest_char = 0;
431 		break;
432 	case ' ':
433 		if (end_of_string)
434 			dest_char = cpu_to_le16(SFM_SPACE);
435 		else
436 			dest_char = 0;
437 		break;
438 	default:
439 		dest_char = 0;
440 	}
441 
442 	return dest_char;
443 }
444 
445 /*
446  * Convert 16 bit Unicode pathname to wire format from string in current code
447  * page. Conversion may involve remapping up the six characters that are
448  * only legal in POSIX-like OS (if they are present in the string). Path
449  * names are little endian 16 bit Unicode on the wire
450  */
451 int
452 cifsConvertToUTF16(__le16 *target, const char *source, int srclen,
453 		 const struct nls_table *cp, int map_chars)
454 {
455 	int i, charlen;
456 	int j = 0;
457 	char src_char;
458 	__le16 dst_char;
459 	wchar_t tmp;
460 	wchar_t *wchar_to;	/* UTF-16 */
461 	int ret;
462 	unicode_t u;
463 
464 	if (map_chars == NO_MAP_UNI_RSVD)
465 		return cifs_strtoUTF16(target, source, PATH_MAX, cp);
466 
467 	wchar_to = kzalloc(6, GFP_KERNEL);
468 
469 	for (i = 0; i < srclen; j++) {
470 		src_char = source[i];
471 		charlen = 1;
472 
473 		/* check if end of string */
474 		if (src_char == 0)
475 			goto ctoUTF16_out;
476 
477 		/* see if we must remap this char */
478 		if (map_chars == SFU_MAP_UNI_RSVD)
479 			dst_char = convert_to_sfu_char(src_char);
480 		else if (map_chars == SFM_MAP_UNI_RSVD) {
481 			bool end_of_string;
482 
483 			/**
484 			 * Remap spaces and periods found at the end of every
485 			 * component of the path. The special cases of '.' and
486 			 * '..' are need to be handled because of symlinks.
487 			 * They are treated as non-end-of-string to avoid
488 			 * remapping and breaking symlinks pointing to . or ..
489 			 **/
490 			if ((i == 0 || source[i-1] == '\\') &&
491 			    source[i] == '.' &&
492 			    (i == srclen-1 || source[i+1] == '\\'))
493 				end_of_string = false; /* "." case */
494 			else if (i >= 1 &&
495 				 (i == 1 || source[i-2] == '\\') &&
496 				 source[i-1] == '.' &&
497 				 source[i] == '.' &&
498 				 (i == srclen-1 || source[i+1] == '\\'))
499 				end_of_string = false; /* ".." case */
500 			else if ((i == srclen - 1) || (source[i+1] == '\\'))
501 				end_of_string = true;
502 			else
503 				end_of_string = false;
504 
505 			dst_char = convert_to_sfm_char(src_char, end_of_string);
506 		} else
507 			dst_char = 0;
508 		/*
509 		 * FIXME: We can not handle remapping backslash (UNI_SLASH)
510 		 * until all the calls to build_path_from_dentry are modified,
511 		 * as they use backslash as separator.
512 		 */
513 		if (dst_char == 0) {
514 			charlen = cp->char2uni(source + i, srclen - i, &tmp);
515 			dst_char = cpu_to_le16(tmp);
516 
517 			/*
518 			 * if no match, use question mark, which at least in
519 			 * some cases serves as wild card
520 			 */
521 			if (charlen > 0)
522 				goto ctoUTF16;
523 
524 			/* convert SURROGATE_PAIR */
525 			if (strcmp(cp->charset, "utf8") || !wchar_to)
526 				goto unknown;
527 			if (*(source + i) & 0x80) {
528 				charlen = utf8_to_utf32(source + i, 6, &u);
529 				if (charlen < 0)
530 					goto unknown;
531 			} else
532 				goto unknown;
533 			ret  = utf8s_to_utf16s(source + i, charlen,
534 					       UTF16_LITTLE_ENDIAN,
535 					       wchar_to, 6);
536 			if (ret < 0)
537 				goto unknown;
538 
539 			i += charlen;
540 			dst_char = cpu_to_le16(*wchar_to);
541 			if (charlen <= 3)
542 				/* 1-3bytes UTF-8 to 2bytes UTF-16 */
543 				put_unaligned(dst_char, &target[j]);
544 			else if (charlen == 4) {
545 				/* 4bytes UTF-8(surrogate pair) to 4bytes UTF-16
546 				 * 7-8bytes UTF-8(IVS) divided to 2 UTF-16
547 				 *   (charlen=3+4 or 4+4) */
548 				put_unaligned(dst_char, &target[j]);
549 				dst_char = cpu_to_le16(*(wchar_to + 1));
550 				j++;
551 				put_unaligned(dst_char, &target[j]);
552 			} else if (charlen >= 5) {
553 				/* 5-6bytes UTF-8 to 6bytes UTF-16 */
554 				put_unaligned(dst_char, &target[j]);
555 				dst_char = cpu_to_le16(*(wchar_to + 1));
556 				j++;
557 				put_unaligned(dst_char, &target[j]);
558 				dst_char = cpu_to_le16(*(wchar_to + 2));
559 				j++;
560 				put_unaligned(dst_char, &target[j]);
561 			}
562 			continue;
563 
564 unknown:
565 			dst_char = cpu_to_le16(0x003f);
566 			charlen = 1;
567 		}
568 
569 ctoUTF16:
570 		/*
571 		 * character may take more than one byte in the source string,
572 		 * but will take exactly two bytes in the target string
573 		 */
574 		i += charlen;
575 		put_unaligned(dst_char, &target[j]);
576 	}
577 
578 ctoUTF16_out:
579 	put_unaligned(0, &target[j]); /* Null terminate target unicode string */
580 	kfree(wchar_to);
581 	return j;
582 }
583 
584 /*
585  * cifs_local_to_utf16_bytes - how long will a string be after conversion?
586  * @from - pointer to input string
587  * @maxbytes - don't go past this many bytes of input string
588  * @codepage - source codepage
589  *
590  * Walk a string and return the number of bytes that the string will
591  * be after being converted to the given charset, not including any null
592  * termination required. Don't walk past maxbytes in the source buffer.
593  */
594 
595 static int
596 cifs_local_to_utf16_bytes(const char *from, int len,
597 			  const struct nls_table *codepage)
598 {
599 	int charlen;
600 	int i;
601 	wchar_t wchar_to;
602 
603 	for (i = 0; len && *from; i++, from += charlen, len -= charlen) {
604 		charlen = codepage->char2uni(from, len, &wchar_to);
605 		/* Failed conversion defaults to a question mark */
606 		if (charlen < 1)
607 			charlen = 1;
608 	}
609 	return 2 * i; /* UTF16 characters are two bytes */
610 }
611 
612 /*
613  * cifs_strndup_to_utf16 - copy a string to wire format from the local codepage
614  * @src - source string
615  * @maxlen - don't walk past this many bytes in the source string
616  * @utf16_len - the length of the allocated string in bytes (including null)
617  * @cp - source codepage
618  * @remap - map special chars
619  *
620  * Take a string convert it from the local codepage to UTF16 and
621  * put it in a new buffer. Returns a pointer to the new string or NULL on
622  * error.
623  */
624 __le16 *
625 cifs_strndup_to_utf16(const char *src, const int maxlen, int *utf16_len,
626 		      const struct nls_table *cp, int remap)
627 {
628 	int len;
629 	__le16 *dst;
630 
631 	if (!src)
632 		return NULL;
633 
634 	len = cifs_local_to_utf16_bytes(src, maxlen, cp);
635 	len += 2; /* NULL */
636 	dst = kmalloc(len, GFP_KERNEL);
637 	if (!dst) {
638 		*utf16_len = 0;
639 		return NULL;
640 	}
641 	cifsConvertToUTF16(dst, src, strlen(src), cp, remap);
642 	*utf16_len = len;
643 	return dst;
644 }
645