xref: /linux/fs/smb/server/unicode.c (revision 70d7f7dbd98a4d499b46ec9ef2bd1f2698facf2b)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  *   Some of the source code in this file came from fs/cifs/cifs_unicode.c
4  *
5  *   Copyright (c) International Business Machines  Corp., 2000,2009
6  *   Modified by Steve French (sfrench@us.ibm.com)
7  *   Modified by Namjae Jeon (linkinjeon@kernel.org)
8  */
9 #include <linux/fs.h>
10 #include <linux/slab.h>
11 #include <linux/unaligned.h>
12 #include "glob.h"
13 #include "unicode.h"
14 #include "smb_common.h"
15 
16 /*
17  * cifs_mapchar() - convert a host-endian char to proper char in codepage
18  * @target:	where converted character should be copied
19  * @from:	host-endian source string
20  * @cp:		codepage to which character should be converted
21  * @mapchar:	should character be mapped according to mapchars mount option?
22  *
23  * This function handles the conversion of a single character. It is the
24  * responsibility of the caller to ensure that the target buffer is large
25  * enough to hold the result of the conversion (at least NLS_MAX_CHARSET_SIZE).
26  *
27  * Return:	string length after conversion
28  */
29 static int
30 cifs_mapchar(char *target, const __u16 *from, const struct nls_table *cp,
31 	     bool mapchar)
32 {
33 	int len = 1;
34 	__u16 src_char;
35 
36 	src_char = *from;
37 
38 	if (!mapchar)
39 		goto cp_convert;
40 
41 	/*
42 	 * BB: Cannot handle remapping UNI_SLASH until all the calls to
43 	 *     build_path_from_dentry are modified, as they use slash as
44 	 *     separator.
45 	 */
46 	switch (src_char) {
47 	case UNI_COLON:
48 		*target = ':';
49 		break;
50 	case UNI_ASTERISK:
51 		*target = '*';
52 		break;
53 	case UNI_QUESTION:
54 		*target = '?';
55 		break;
56 	case UNI_PIPE:
57 		*target = '|';
58 		break;
59 	case UNI_GRTRTHAN:
60 		*target = '>';
61 		break;
62 	case UNI_LESSTHAN:
63 		*target = '<';
64 		break;
65 	default:
66 		goto cp_convert;
67 	}
68 
69 out:
70 	return len;
71 
72 cp_convert:
73 	len = cp->uni2char(src_char, target, NLS_MAX_CHARSET_SIZE);
74 	if (len <= 0)
75 		goto surrogate_pair;
76 
77 	goto out;
78 
79 surrogate_pair:
80 	/* convert SURROGATE_PAIR and IVS */
81 	if (strcmp(cp->charset, "utf8"))
82 		goto unknown;
83 	len = utf16s_to_utf8s(from, 3, UTF16_LITTLE_ENDIAN, target, 6);
84 	if (len <= 0)
85 		goto unknown;
86 	return len;
87 
88 unknown:
89 	*target = '?';
90 	len = 1;
91 	goto out;
92 }
93 
94 /*
95  * smb_utf16_bytes() - compute converted string length
96  * @from:	pointer to input string
97  * @maxbytes:	input string length
98  * @codepage:	destination codepage
99  *
100  * Walk a utf16le string and return the number of bytes that the string will
101  * be after being converted to the given charset, not including any null
102  * termination required. Don't walk past maxbytes in the source buffer.
103  *
104  * Return:	string length after conversion
105  */
106 static int smb_utf16_bytes(const __le16 *from, int maxbytes,
107 			   const struct nls_table *codepage)
108 {
109 	int i, j;
110 	int charlen, outlen = 0;
111 	int maxwords = maxbytes / 2;
112 	char tmp[NLS_MAX_CHARSET_SIZE];
113 	__u16 ftmp[3];
114 
115 	for (i = 0; i < maxwords; i++) {
116 		ftmp[0] = get_unaligned_le16(&from[i]);
117 		if (ftmp[0] == 0)
118 			break;
119 		for (j = 1; j <= 2; j++) {
120 			if (i + j < maxwords)
121 				ftmp[j] = get_unaligned_le16(&from[i + j]);
122 			else
123 				ftmp[j] = 0;
124 		}
125 
126 		charlen = cifs_mapchar(tmp, ftmp, codepage, 0);
127 		if (charlen > 0)
128 			outlen += charlen;
129 		else
130 			outlen++;
131 	}
132 
133 	return outlen;
134 }
135 
136 /*
137  * smb_from_utf16() - convert utf16le string to local charset
138  * @to:		destination buffer
139  * @from:	source buffer
140  * @tolen:	destination buffer size (in bytes)
141  * @fromlen:	source buffer size (in bytes)
142  * @codepage:	codepage to which characters should be converted
143  * @mapchar:	should characters be remapped according to the mapchars option?
144  *
145  * Convert a little-endian utf16le string (as sent by the server) to a string
146  * in the provided codepage. The tolen and fromlen parameters are to ensure
147  * that the code doesn't walk off of the end of the buffer (which is always
148  * a danger if the alignment of the source buffer is off). The destination
149  * string is always properly null terminated and fits in the destination
150  * buffer. Returns the length of the destination string in bytes (including
151  * null terminator).
152  *
153  * Note that some windows versions actually send multiword UTF-16 characters
154  * instead of straight UTF16-2. The linux nls routines however aren't able to
155  * deal with those characters properly. In the event that we get some of
156  * those characters, they won't be translated properly.
157  *
158  * Return:	string length after conversion
159  */
160 static int smb_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
161 			  const struct nls_table *codepage, bool mapchar)
162 {
163 	int i, j, charlen, safelen;
164 	int outlen = 0;
165 	int nullsize = nls_nullsize(codepage);
166 	int fromwords = fromlen / 2;
167 	char tmp[NLS_MAX_CHARSET_SIZE];
168 	__u16 ftmp[3];	/* ftmp[3] = 3array x 2bytes = 6bytes UTF-16 */
169 
170 	/*
171 	 * because the chars can be of varying widths, we need to take care
172 	 * not to overflow the destination buffer when we get close to the
173 	 * end of it. Until we get to this offset, we don't need to check
174 	 * for overflow however.
175 	 */
176 	safelen = tolen - (NLS_MAX_CHARSET_SIZE + nullsize);
177 
178 	for (i = 0; i < fromwords; i++) {
179 		ftmp[0] = get_unaligned_le16(&from[i]);
180 		if (ftmp[0] == 0)
181 			break;
182 		for (j = 1; j <= 2; j++) {
183 			if (i + j < fromwords)
184 				ftmp[j] = get_unaligned_le16(&from[i + j]);
185 			else
186 				ftmp[j] = 0;
187 		}
188 
189 		/*
190 		 * check to see if converting this character might make the
191 		 * conversion bleed into the null terminator
192 		 */
193 		if (outlen >= safelen) {
194 			charlen = cifs_mapchar(tmp, ftmp, codepage, mapchar);
195 			if ((outlen + charlen) > (tolen - nullsize))
196 				break;
197 		}
198 
199 		/* put converted char into 'to' buffer */
200 		charlen = cifs_mapchar(&to[outlen], ftmp, codepage, mapchar);
201 		outlen += charlen;
202 
203 		/*
204 		 * charlen (=bytes of UTF-8 for 1 character)
205 		 * 4bytes UTF-8(surrogate pair) is charlen=4
206 		 * (4bytes UTF-16 code)
207 		 * 7-8bytes UTF-8(IVS) is charlen=3+4 or 4+4
208 		 * (2 UTF-8 pairs divided to 2 UTF-16 pairs)
209 		 */
210 		if (charlen == 4)
211 			i++;
212 		else if (charlen >= 5)
213 			/* 5-6bytes UTF-8 */
214 			i += 2;
215 	}
216 
217 	/* properly null-terminate string */
218 	for (i = 0; i < nullsize; i++)
219 		to[outlen++] = 0;
220 
221 	return outlen;
222 }
223 
224 /*
225  * smb_strtoUTF16() - Convert character string to unicode string
226  * @to:		destination buffer
227  * @from:	source buffer
228  * @len:	destination buffer size (in bytes)
229  * @codepage:	codepage to which characters should be converted
230  *
231  * Return:	string length after conversion
232  */
233 int smb_strtoUTF16(__le16 *to, const char *from, int len,
234 		   const struct nls_table *codepage)
235 {
236 	int charlen;
237 	int i;
238 	wchar_t wchar_to; /* needed to quiet sparse */
239 
240 	/* special case for utf8 to handle no plane0 chars */
241 	if (!strcmp(codepage->charset, "utf8")) {
242 		/*
243 		 * convert utf8 -> utf16, we assume we have enough space
244 		 * as caller should have assumed conversion does not overflow
245 		 * in destination len is length in wchar_t units (16bits)
246 		 */
247 		i  = utf8s_to_utf16s(from, len, UTF16_LITTLE_ENDIAN,
248 				     (wchar_t *)to, len);
249 
250 		/* if success terminate and exit */
251 		if (i >= 0)
252 			goto success;
253 		/*
254 		 * if fails fall back to UCS encoding as this
255 		 * function should not return negative values
256 		 * currently can fail only if source contains
257 		 * invalid encoded characters
258 		 */
259 	}
260 
261 	for (i = 0; len > 0 && *from; i++, from += charlen, len -= charlen) {
262 		charlen = codepage->char2uni(from, len, &wchar_to);
263 		if (charlen < 1) {
264 			/* A question mark */
265 			wchar_to = 0x003f;
266 			charlen = 1;
267 		}
268 		put_unaligned_le16(wchar_to, &to[i]);
269 	}
270 
271 success:
272 	put_unaligned_le16(0, &to[i]);
273 	return i;
274 }
275 
276 /*
277  * smb_strndup_from_utf16() - copy a string from wire format to the local
278  *		codepage
279  * @src:	source string
280  * @maxlen:	don't walk past this many bytes in the source string
281  * @is_unicode:	is this a unicode string?
282  * @codepage:	destination codepage
283  *
284  * Take a string given by the server, convert it to the local codepage and
285  * put it in a new buffer. Returns a pointer to the new string or NULL on
286  * error.
287  *
288  * Return:	destination string buffer or error ptr
289  */
290 char *smb_strndup_from_utf16(const char *src, const int maxlen,
291 			     const bool is_unicode,
292 			     const struct nls_table *codepage)
293 {
294 	int len, ret;
295 	char *dst;
296 
297 	if (is_unicode) {
298 		len = smb_utf16_bytes((__le16 *)src, maxlen, codepage);
299 		len += nls_nullsize(codepage);
300 		dst = kmalloc(len, GFP_KERNEL);
301 		if (!dst)
302 			return ERR_PTR(-ENOMEM);
303 		ret = smb_from_utf16(dst, (__le16 *)src, len, maxlen, codepage,
304 				     false);
305 		if (ret < 0) {
306 			kfree(dst);
307 			return ERR_PTR(-EINVAL);
308 		}
309 	} else {
310 		len = strnlen(src, maxlen);
311 		len++;
312 		dst = kmalloc(len, GFP_KERNEL);
313 		if (!dst)
314 			return ERR_PTR(-ENOMEM);
315 		strscpy(dst, src, len);
316 	}
317 
318 	return dst;
319 }
320 
321 /*
322  * Convert 16 bit Unicode pathname to wire format from string in current code
323  * page. Conversion may involve remapping up the six characters that are
324  * only legal in POSIX-like OS (if they are present in the string). Path
325  * names are little endian 16 bit Unicode on the wire
326  */
327 /*
328  * smbConvertToUTF16() - convert string from local charset to utf16
329  * @target:	destination buffer
330  * @source:	source buffer
331  * @srclen:	source buffer size (in bytes)
332  * @cp:		codepage to which characters should be converted
333  * @mapchar:	should characters be remapped according to the mapchars option?
334  *
335  * Convert 16 bit Unicode pathname to wire format from string in current code
336  * page. Conversion may involve remapping up the six characters that are
337  * only legal in POSIX-like OS (if they are present in the string). Path
338  * names are little endian 16 bit Unicode on the wire
339  *
340  * Return:	char length after conversion
341  */
342 int smbConvertToUTF16(__le16 *target, const char *source, int srclen,
343 		      const struct nls_table *cp, int mapchars)
344 {
345 	int i, j, charlen;
346 	char src_char;
347 	__le16 dst_char;
348 	wchar_t tmp;
349 	wchar_t wchar_to[6];	/* UTF-16 */
350 	int ret;
351 	unicode_t u;
352 
353 	if (!mapchars)
354 		return smb_strtoUTF16(target, source, srclen, cp);
355 
356 	for (i = 0, j = 0; i < srclen; j++) {
357 		src_char = source[i];
358 		charlen = 1;
359 		switch (src_char) {
360 		case 0:
361 			put_unaligned(0, &target[j]);
362 			return j;
363 		case ':':
364 			dst_char = cpu_to_le16(UNI_COLON);
365 			break;
366 		case '*':
367 			dst_char = cpu_to_le16(UNI_ASTERISK);
368 			break;
369 		case '?':
370 			dst_char = cpu_to_le16(UNI_QUESTION);
371 			break;
372 		case '<':
373 			dst_char = cpu_to_le16(UNI_LESSTHAN);
374 			break;
375 		case '>':
376 			dst_char = cpu_to_le16(UNI_GRTRTHAN);
377 			break;
378 		case '|':
379 			dst_char = cpu_to_le16(UNI_PIPE);
380 			break;
381 		/*
382 		 * FIXME: We can not handle remapping backslash (UNI_SLASH)
383 		 * until all the calls to build_path_from_dentry are modified,
384 		 * as they use backslash as separator.
385 		 */
386 		default:
387 			charlen = cp->char2uni(source + i, srclen - i, &tmp);
388 			dst_char = cpu_to_le16(tmp);
389 
390 			/*
391 			 * if no match, use question mark, which at least in
392 			 * some cases serves as wild card
393 			 */
394 			if (charlen > 0)
395 				goto ctoUTF16;
396 
397 			/* convert SURROGATE_PAIR */
398 			if (strcmp(cp->charset, "utf8"))
399 				goto unknown;
400 			if (*(source + i) & 0x80) {
401 				charlen = utf8_to_utf32(source + i, 6, &u);
402 				if (charlen < 0)
403 					goto unknown;
404 			} else
405 				goto unknown;
406 			ret  = utf8s_to_utf16s(source + i, charlen,
407 					UTF16_LITTLE_ENDIAN,
408 					wchar_to, 6);
409 			if (ret < 0)
410 				goto unknown;
411 
412 			i += charlen;
413 			dst_char = cpu_to_le16(*wchar_to);
414 			if (charlen <= 3)
415 				/* 1-3bytes UTF-8 to 2bytes UTF-16 */
416 				put_unaligned(dst_char, &target[j]);
417 			else if (charlen == 4) {
418 				/*
419 				 * 4bytes UTF-8(surrogate pair) to 4bytes UTF-16
420 				 * 7-8bytes UTF-8(IVS) divided to 2 UTF-16
421 				 * (charlen=3+4 or 4+4)
422 				 */
423 				put_unaligned(dst_char, &target[j]);
424 				dst_char = cpu_to_le16(*(wchar_to + 1));
425 				j++;
426 				put_unaligned(dst_char, &target[j]);
427 			} else if (charlen >= 5) {
428 				/* 5-6bytes UTF-8 to 6bytes UTF-16 */
429 				put_unaligned(dst_char, &target[j]);
430 				dst_char = cpu_to_le16(*(wchar_to + 1));
431 				j++;
432 				put_unaligned(dst_char, &target[j]);
433 				dst_char = cpu_to_le16(*(wchar_to + 2));
434 				j++;
435 				put_unaligned(dst_char, &target[j]);
436 			}
437 			continue;
438 
439 unknown:
440 			dst_char = cpu_to_le16(0x003f);
441 			charlen = 1;
442 		}
443 
444 ctoUTF16:
445 		/*
446 		 * character may take more than one byte in the source string,
447 		 * but will take exactly two bytes in the target string
448 		 */
449 		i += charlen;
450 		put_unaligned(dst_char, &target[j]);
451 	}
452 
453 	return j;
454 }
455