xref: /linux/fs/smb/client/cifs_unicode.c (revision 0f7e753fc3851aac8aeea6b551cbbcf6ca9093dd)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  *
4  *   Copyright (c) International Business Machines  Corp., 2000,2009
5  *   Modified by Steve French (sfrench@us.ibm.com)
6  */
7 #include <linux/fs.h>
8 #include <linux/slab.h>
9 #include "cifs_fs_sb.h"
10 #include "cifs_unicode.h"
11 #include "cifspdu.h"
12 #include "cifsglob.h"
13 #include "cifs_debug.h"
14 
15 int cifs_remap(struct cifs_sb_info *cifs_sb)
16 {
17 	int map_type;
18 
19 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SFM_CHR)
20 		map_type = SFM_MAP_UNI_RSVD;
21 	else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR)
22 		map_type = SFU_MAP_UNI_RSVD;
23 	else
24 		map_type = NO_MAP_UNI_RSVD;
25 
26 	return map_type;
27 }
28 
29 /* Convert character using the SFU - "Services for Unix" remapping range */
30 static bool
31 convert_sfu_char(const __u16 src_char, char *target)
32 {
33 	/*
34 	 * BB: Cannot handle remapping UNI_SLASH until all the calls to
35 	 *     build_path_from_dentry are modified, as they use slash as
36 	 *     separator.
37 	 */
38 	switch (src_char) {
39 	case UNI_COLON:
40 		*target = ':';
41 		break;
42 	case UNI_ASTERISK:
43 		*target = '*';
44 		break;
45 	case UNI_QUESTION:
46 		*target = '?';
47 		break;
48 	case UNI_PIPE:
49 		*target = '|';
50 		break;
51 	case UNI_GRTRTHAN:
52 		*target = '>';
53 		break;
54 	case UNI_LESSTHAN:
55 		*target = '<';
56 		break;
57 	default:
58 		return false;
59 	}
60 	return true;
61 }
62 
63 /* Convert character using the SFM - "Services for Mac" remapping range */
64 static bool
65 convert_sfm_char(const __u16 src_char, char *target)
66 {
67 	if (src_char >= 0xF001 && src_char <= 0xF01F) {
68 		*target = src_char - 0xF000;
69 		return true;
70 	}
71 	switch (src_char) {
72 	case SFM_COLON:
73 		*target = ':';
74 		break;
75 	case SFM_DOUBLEQUOTE:
76 		*target = '"';
77 		break;
78 	case SFM_ASTERISK:
79 		*target = '*';
80 		break;
81 	case SFM_QUESTION:
82 		*target = '?';
83 		break;
84 	case SFM_PIPE:
85 		*target = '|';
86 		break;
87 	case SFM_GRTRTHAN:
88 		*target = '>';
89 		break;
90 	case SFM_LESSTHAN:
91 		*target = '<';
92 		break;
93 	case SFM_SPACE:
94 		*target = ' ';
95 		break;
96 	case SFM_PERIOD:
97 		*target = '.';
98 		break;
99 	default:
100 		return false;
101 	}
102 	return true;
103 }
104 
105 
106 /*
107  * cifs_mapchar - convert a host-endian char to proper char in codepage
108  * @target - where converted character should be copied
109  * @src_char - 2 byte host-endian source character
110  * @cp - codepage to which character should be converted
111  * @map_type - How should the 7 NTFS/SMB reserved characters be mapped to UCS2?
112  *
113  * This function handles the conversion of a single character. It is the
114  * responsibility of the caller to ensure that the target buffer is large
115  * enough to hold the result of the conversion (at least NLS_MAX_CHARSET_SIZE).
116  */
117 static int
118 cifs_mapchar(char *target, const __u16 *from, const struct nls_table *cp,
119 	     int maptype)
120 {
121 	int len = 1;
122 	__u16 src_char;
123 
124 	src_char = *from;
125 
126 	if ((maptype == SFM_MAP_UNI_RSVD) && convert_sfm_char(src_char, target))
127 		return len;
128 	else if ((maptype == SFU_MAP_UNI_RSVD) &&
129 		  convert_sfu_char(src_char, target))
130 		return len;
131 
132 	/* if character not one of seven in special remap set */
133 	len = cp->uni2char(src_char, target, NLS_MAX_CHARSET_SIZE);
134 	if (len <= 0)
135 		goto surrogate_pair;
136 
137 	return len;
138 
139 surrogate_pair:
140 	/* convert SURROGATE_PAIR and IVS */
141 	if (strcmp(cp->charset, "utf8"))
142 		goto unknown;
143 	len = utf16s_to_utf8s(from, 3, UTF16_LITTLE_ENDIAN, target, 6);
144 	if (len <= 0)
145 		goto unknown;
146 	return len;
147 
148 unknown:
149 	*target = '?';
150 	len = 1;
151 	return len;
152 }
153 
154 /*
155  * cifs_from_utf16 - convert utf16le string to local charset
156  * @to - destination buffer
157  * @from - source buffer
158  * @tolen - destination buffer size (in bytes)
159  * @fromlen - source buffer size (in bytes)
160  * @codepage - codepage to which characters should be converted
161  * @mapchar - should characters be remapped according to the mapchars option?
162  *
163  * Convert a little-endian utf16le string (as sent by the server) to a string
164  * in the provided codepage. The tolen and fromlen parameters are to ensure
165  * that the code doesn't walk off of the end of the buffer (which is always
166  * a danger if the alignment of the source buffer is off). The destination
167  * string is always properly null terminated and fits in the destination
168  * buffer. Returns the length of the destination string in bytes (including
169  * null terminator).
170  *
171  * Note that some windows versions actually send multiword UTF-16 characters
172  * instead of straight UTF16-2. The linux nls routines however aren't able to
173  * deal with those characters properly. In the event that we get some of
174  * those characters, they won't be translated properly.
175  */
176 int
177 cifs_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
178 		const struct nls_table *codepage, int map_type)
179 {
180 	int i, charlen, safelen;
181 	int outlen = 0;
182 	int nullsize = nls_nullsize(codepage);
183 	int fromwords = fromlen / 2;
184 	char tmp[NLS_MAX_CHARSET_SIZE];
185 	__u16 ftmp[3];		/* ftmp[3] = 3array x 2bytes = 6bytes UTF-16 */
186 
187 	/*
188 	 * because the chars can be of varying widths, we need to take care
189 	 * not to overflow the destination buffer when we get close to the
190 	 * end of it. Until we get to this offset, we don't need to check
191 	 * for overflow however.
192 	 */
193 	safelen = tolen - (NLS_MAX_CHARSET_SIZE + nullsize);
194 
195 	for (i = 0; i < fromwords; i++) {
196 		ftmp[0] = get_unaligned_le16(&from[i]);
197 		if (ftmp[0] == 0)
198 			break;
199 		if (i + 1 < fromwords)
200 			ftmp[1] = get_unaligned_le16(&from[i + 1]);
201 		else
202 			ftmp[1] = 0;
203 		if (i + 2 < fromwords)
204 			ftmp[2] = get_unaligned_le16(&from[i + 2]);
205 		else
206 			ftmp[2] = 0;
207 
208 		/*
209 		 * check to see if converting this character might make the
210 		 * conversion bleed into the null terminator
211 		 */
212 		if (outlen >= safelen) {
213 			charlen = cifs_mapchar(tmp, ftmp, codepage, map_type);
214 			if ((outlen + charlen) > (tolen - nullsize))
215 				break;
216 		}
217 
218 		/* put converted char into 'to' buffer */
219 		charlen = cifs_mapchar(&to[outlen], ftmp, codepage, map_type);
220 		outlen += charlen;
221 
222 		/* charlen (=bytes of UTF-8 for 1 character)
223 		 * 4bytes UTF-8(surrogate pair) is charlen=4
224 		 *   (4bytes UTF-16 code)
225 		 * 7-8bytes UTF-8(IVS) is charlen=3+4 or 4+4
226 		 *   (2 UTF-8 pairs divided to 2 UTF-16 pairs) */
227 		if (charlen == 4)
228 			i++;
229 		else if (charlen >= 5)
230 			/* 5-6bytes UTF-8 */
231 			i += 2;
232 	}
233 
234 	/* properly null-terminate string */
235 	for (i = 0; i < nullsize; i++)
236 		to[outlen++] = 0;
237 
238 	return outlen;
239 }
240 
241 /*
242  * NAME:	cifs_strtoUTF16()
243  *
244  * FUNCTION:	Convert character string to unicode string
245  *
246  */
247 int
248 cifs_strtoUTF16(__le16 *to, const char *from, int len,
249 	      const struct nls_table *codepage)
250 {
251 	int charlen;
252 	int i;
253 	wchar_t wchar_to; /* needed to quiet sparse */
254 
255 	/* special case for utf8 to handle no plane0 chars */
256 	if (!strcmp(codepage->charset, "utf8")) {
257 		/*
258 		 * convert utf8 -> utf16, we assume we have enough space
259 		 * as caller should have assumed conversion does not overflow
260 		 * in destination len is length in wchar_t units (16bits)
261 		 */
262 		i  = utf8s_to_utf16s(from, len, UTF16_LITTLE_ENDIAN,
263 				       (wchar_t *) to, len);
264 
265 		/* if success terminate and exit */
266 		if (i >= 0)
267 			goto success;
268 		/*
269 		 * if fails fall back to UCS encoding as this
270 		 * function should not return negative values
271 		 * currently can fail only if source contains
272 		 * invalid encoded characters
273 		 */
274 	}
275 
276 	for (i = 0; len && *from; i++, from += charlen, len -= charlen) {
277 		charlen = codepage->char2uni(from, len, &wchar_to);
278 		if (charlen < 1) {
279 			cifs_dbg(VFS, "strtoUTF16: char2uni of 0x%x returned %d\n",
280 				 *from, charlen);
281 			/* A question mark */
282 			wchar_to = 0x003f;
283 			charlen = 1;
284 		}
285 		put_unaligned_le16(wchar_to, &to[i]);
286 	}
287 
288 success:
289 	put_unaligned_le16(0, &to[i]);
290 	return i;
291 }
292 
293 /*
294  * cifs_utf16_bytes - how long will a string be after conversion?
295  * @utf16 - pointer to input string
296  * @maxbytes - don't go past this many bytes of input string
297  * @codepage - destination codepage
298  *
299  * Walk a utf16le string and return the number of bytes that the string will
300  * be after being converted to the given charset, not including any null
301  * termination required. Don't walk past maxbytes in the source buffer.
302  */
303 int
304 cifs_utf16_bytes(const __le16 *from, int maxbytes,
305 		const struct nls_table *codepage)
306 {
307 	int i;
308 	int charlen, outlen = 0;
309 	int maxwords = maxbytes / 2;
310 	char tmp[NLS_MAX_CHARSET_SIZE];
311 	__u16 ftmp[3];
312 
313 	for (i = 0; i < maxwords; i++) {
314 		ftmp[0] = get_unaligned_le16(&from[i]);
315 		if (ftmp[0] == 0)
316 			break;
317 		if (i + 1 < maxwords)
318 			ftmp[1] = get_unaligned_le16(&from[i + 1]);
319 		else
320 			ftmp[1] = 0;
321 		if (i + 2 < maxwords)
322 			ftmp[2] = get_unaligned_le16(&from[i + 2]);
323 		else
324 			ftmp[2] = 0;
325 
326 		charlen = cifs_mapchar(tmp, ftmp, codepage, NO_MAP_UNI_RSVD);
327 		outlen += charlen;
328 	}
329 
330 	return outlen;
331 }
332 
333 /*
334  * cifs_strndup_from_utf16 - copy a string from wire format to the local
335  * codepage
336  * @src - source string
337  * @maxlen - don't walk past this many bytes in the source string
338  * @is_unicode - is this a unicode string?
339  * @codepage - destination codepage
340  *
341  * Take a string given by the server, convert it to the local codepage and
342  * put it in a new buffer. Returns a pointer to the new string or NULL on
343  * error.
344  */
345 char *
346 cifs_strndup_from_utf16(const char *src, const int maxlen,
347 			const bool is_unicode, const struct nls_table *codepage)
348 {
349 	int len;
350 	char *dst;
351 
352 	if (is_unicode) {
353 		len = cifs_utf16_bytes((__le16 *) src, maxlen, codepage);
354 		len += nls_nullsize(codepage);
355 		dst = kmalloc(len, GFP_KERNEL);
356 		if (!dst)
357 			return NULL;
358 		cifs_from_utf16(dst, (__le16 *) src, len, maxlen, codepage,
359 				NO_MAP_UNI_RSVD);
360 	} else {
361 		dst = kstrndup(src, maxlen, GFP_KERNEL);
362 	}
363 
364 	return dst;
365 }
366 
367 static __le16 convert_to_sfu_char(char src_char)
368 {
369 	__le16 dest_char;
370 
371 	switch (src_char) {
372 	case ':':
373 		dest_char = cpu_to_le16(UNI_COLON);
374 		break;
375 	case '*':
376 		dest_char = cpu_to_le16(UNI_ASTERISK);
377 		break;
378 	case '?':
379 		dest_char = cpu_to_le16(UNI_QUESTION);
380 		break;
381 	case '<':
382 		dest_char = cpu_to_le16(UNI_LESSTHAN);
383 		break;
384 	case '>':
385 		dest_char = cpu_to_le16(UNI_GRTRTHAN);
386 		break;
387 	case '|':
388 		dest_char = cpu_to_le16(UNI_PIPE);
389 		break;
390 	default:
391 		dest_char = 0;
392 	}
393 
394 	return dest_char;
395 }
396 
397 static __le16 convert_to_sfm_char(char src_char, bool end_of_string)
398 {
399 	__le16 dest_char;
400 
401 	if (src_char >= 0x01 && src_char <= 0x1F) {
402 		dest_char = cpu_to_le16(src_char + 0xF000);
403 		return dest_char;
404 	}
405 	switch (src_char) {
406 	case ':':
407 		dest_char = cpu_to_le16(SFM_COLON);
408 		break;
409 	case '"':
410 		dest_char = cpu_to_le16(SFM_DOUBLEQUOTE);
411 		break;
412 	case '*':
413 		dest_char = cpu_to_le16(SFM_ASTERISK);
414 		break;
415 	case '?':
416 		dest_char = cpu_to_le16(SFM_QUESTION);
417 		break;
418 	case '<':
419 		dest_char = cpu_to_le16(SFM_LESSTHAN);
420 		break;
421 	case '>':
422 		dest_char = cpu_to_le16(SFM_GRTRTHAN);
423 		break;
424 	case '|':
425 		dest_char = cpu_to_le16(SFM_PIPE);
426 		break;
427 	case '.':
428 		if (end_of_string)
429 			dest_char = cpu_to_le16(SFM_PERIOD);
430 		else
431 			dest_char = 0;
432 		break;
433 	case ' ':
434 		if (end_of_string)
435 			dest_char = cpu_to_le16(SFM_SPACE);
436 		else
437 			dest_char = 0;
438 		break;
439 	default:
440 		dest_char = 0;
441 	}
442 
443 	return dest_char;
444 }
445 
446 /*
447  * Convert 16 bit Unicode pathname to wire format from string in current code
448  * page. Conversion may involve remapping up the six characters that are
449  * only legal in POSIX-like OS (if they are present in the string). Path
450  * names are little endian 16 bit Unicode on the wire
451  */
452 int
453 cifsConvertToUTF16(__le16 *target, const char *source, int srclen,
454 		 const struct nls_table *cp, int map_chars)
455 {
456 	int i, charlen;
457 	int j = 0;
458 	char src_char;
459 	__le16 dst_char;
460 	wchar_t tmp;
461 	wchar_t *wchar_to;	/* UTF-16 */
462 	int ret;
463 	unicode_t u;
464 
465 	if (map_chars == NO_MAP_UNI_RSVD)
466 		return cifs_strtoUTF16(target, source, PATH_MAX, cp);
467 
468 	wchar_to = kzalloc(6, GFP_KERNEL);
469 
470 	for (i = 0; i < srclen; j++) {
471 		src_char = source[i];
472 		charlen = 1;
473 
474 		/* check if end of string */
475 		if (src_char == 0)
476 			goto ctoUTF16_out;
477 
478 		/* see if we must remap this char */
479 		if (map_chars == SFU_MAP_UNI_RSVD)
480 			dst_char = convert_to_sfu_char(src_char);
481 		else if (map_chars == SFM_MAP_UNI_RSVD) {
482 			bool end_of_string;
483 
484 			/**
485 			 * Remap spaces and periods found at the end of every
486 			 * component of the path. The special cases of '.' and
487 			 * '..' do not need to be dealt with explicitly because
488 			 * they are addressed in namei.c:link_path_walk().
489 			 **/
490 			if ((i == srclen - 1) || (source[i+1] == '\\'))
491 				end_of_string = true;
492 			else
493 				end_of_string = false;
494 
495 			dst_char = convert_to_sfm_char(src_char, end_of_string);
496 		} else
497 			dst_char = 0;
498 		/*
499 		 * FIXME: We can not handle remapping backslash (UNI_SLASH)
500 		 * until all the calls to build_path_from_dentry are modified,
501 		 * as they use backslash as separator.
502 		 */
503 		if (dst_char == 0) {
504 			charlen = cp->char2uni(source + i, srclen - i, &tmp);
505 			dst_char = cpu_to_le16(tmp);
506 
507 			/*
508 			 * if no match, use question mark, which at least in
509 			 * some cases serves as wild card
510 			 */
511 			if (charlen > 0)
512 				goto ctoUTF16;
513 
514 			/* convert SURROGATE_PAIR */
515 			if (strcmp(cp->charset, "utf8") || !wchar_to)
516 				goto unknown;
517 			if (*(source + i) & 0x80) {
518 				charlen = utf8_to_utf32(source + i, 6, &u);
519 				if (charlen < 0)
520 					goto unknown;
521 			} else
522 				goto unknown;
523 			ret  = utf8s_to_utf16s(source + i, charlen,
524 					       UTF16_LITTLE_ENDIAN,
525 					       wchar_to, 6);
526 			if (ret < 0)
527 				goto unknown;
528 
529 			i += charlen;
530 			dst_char = cpu_to_le16(*wchar_to);
531 			if (charlen <= 3)
532 				/* 1-3bytes UTF-8 to 2bytes UTF-16 */
533 				put_unaligned(dst_char, &target[j]);
534 			else if (charlen == 4) {
535 				/* 4bytes UTF-8(surrogate pair) to 4bytes UTF-16
536 				 * 7-8bytes UTF-8(IVS) divided to 2 UTF-16
537 				 *   (charlen=3+4 or 4+4) */
538 				put_unaligned(dst_char, &target[j]);
539 				dst_char = cpu_to_le16(*(wchar_to + 1));
540 				j++;
541 				put_unaligned(dst_char, &target[j]);
542 			} else if (charlen >= 5) {
543 				/* 5-6bytes UTF-8 to 6bytes UTF-16 */
544 				put_unaligned(dst_char, &target[j]);
545 				dst_char = cpu_to_le16(*(wchar_to + 1));
546 				j++;
547 				put_unaligned(dst_char, &target[j]);
548 				dst_char = cpu_to_le16(*(wchar_to + 2));
549 				j++;
550 				put_unaligned(dst_char, &target[j]);
551 			}
552 			continue;
553 
554 unknown:
555 			dst_char = cpu_to_le16(0x003f);
556 			charlen = 1;
557 		}
558 
559 ctoUTF16:
560 		/*
561 		 * character may take more than one byte in the source string,
562 		 * but will take exactly two bytes in the target string
563 		 */
564 		i += charlen;
565 		put_unaligned(dst_char, &target[j]);
566 	}
567 
568 ctoUTF16_out:
569 	put_unaligned(0, &target[j]); /* Null terminate target unicode string */
570 	kfree(wchar_to);
571 	return j;
572 }
573 
574 /*
575  * cifs_local_to_utf16_bytes - how long will a string be after conversion?
576  * @from - pointer to input string
577  * @maxbytes - don't go past this many bytes of input string
578  * @codepage - source codepage
579  *
580  * Walk a string and return the number of bytes that the string will
581  * be after being converted to the given charset, not including any null
582  * termination required. Don't walk past maxbytes in the source buffer.
583  */
584 
585 static int
586 cifs_local_to_utf16_bytes(const char *from, int len,
587 			  const struct nls_table *codepage)
588 {
589 	int charlen;
590 	int i;
591 	wchar_t wchar_to;
592 
593 	for (i = 0; len && *from; i++, from += charlen, len -= charlen) {
594 		charlen = codepage->char2uni(from, len, &wchar_to);
595 		/* Failed conversion defaults to a question mark */
596 		if (charlen < 1)
597 			charlen = 1;
598 	}
599 	return 2 * i; /* UTF16 characters are two bytes */
600 }
601 
602 /*
603  * cifs_strndup_to_utf16 - copy a string to wire format from the local codepage
604  * @src - source string
605  * @maxlen - don't walk past this many bytes in the source string
606  * @utf16_len - the length of the allocated string in bytes (including null)
607  * @cp - source codepage
608  * @remap - map special chars
609  *
610  * Take a string convert it from the local codepage to UTF16 and
611  * put it in a new buffer. Returns a pointer to the new string or NULL on
612  * error.
613  */
614 __le16 *
615 cifs_strndup_to_utf16(const char *src, const int maxlen, int *utf16_len,
616 		      const struct nls_table *cp, int remap)
617 {
618 	int len;
619 	__le16 *dst;
620 
621 	len = cifs_local_to_utf16_bytes(src, maxlen, cp);
622 	len += 2; /* NULL */
623 	dst = kmalloc(len, GFP_KERNEL);
624 	if (!dst) {
625 		*utf16_len = 0;
626 		return NULL;
627 	}
628 	cifsConvertToUTF16(dst, src, strlen(src), cp, remap);
629 	*utf16_len = len;
630 	return dst;
631 }
632