xref: /illumos-gate/usr/src/common/smbsrv/smb_utf8.c (revision 9d6ca3965c3358c32eb68544fe91ff8ad9c3fcde)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  *
25  * Copyright 2018 Nexenta Systems, Inc.  All rights reserved.
26  */
27 
28 /*
29  * Multibyte/wide-char conversion routines. SMB uses UTF-16 on the wire
30  * (smb_wchar_t) and we use UTF-8 internally (our multi-byte, or mbs).
31  */
32 
33 #if defined(_KERNEL) || defined(_FAKE_KERNEL)
34 #include <sys/types.h>
35 #include <sys/sunddi.h>
36 #else	/* _KERNEL || _FAKE_KERNEL */
37 #include <stdio.h>
38 #include <stdlib.h>
39 #include <strings.h>
40 #include <iconv.h>
41 #include <assert.h>
42 #endif	/* _KERNEL || _FAKE_KERNEL */
43 #include <sys/u8_textprep.h>
44 #include <smbsrv/string.h>
45 
46 
47 /*
48  * mbstowcs
49  *
50  * The mbstowcs() function converts a multibyte character string
51  * mbstring into a wide character string wcstring. No more than
52  * nwchars wide characters are stored. A terminating null wide
53  * character is appended if there is room.
54  *
55  * Returns the number of wide characters converted, not counting
56  * any terminating null wide character. Returns -1 if an invalid
57  * multibyte character is encountered.
58  */
59 size_t
60 smb_mbstowcs(smb_wchar_t *wcs, const char *mbs, size_t nwchars)
61 {
62 	size_t mbslen, wcslen;
63 	int err;
64 
65 	/* NULL or empty input is allowed. */
66 	if (mbs == NULL || *mbs == '\0') {
67 		if (wcs != NULL && nwchars > 0)
68 			*wcs = 0;
69 		return (0);
70 	}
71 
72 	/*
73 	 * Traditional mbstowcs(3C) allows wcs==NULL to get the length.
74 	 * SMB never calls it that way, but let's future-proof.
75 	 */
76 	if (wcs == NULL) {
77 		return ((size_t)-1);
78 	}
79 
80 	mbslen = strlen(mbs);
81 	wcslen = nwchars;
82 	err = uconv_u8tou16((const uchar_t *)mbs, &mbslen,
83 	    wcs, &wcslen, UCONV_OUT_LITTLE_ENDIAN);
84 	if (err != 0)
85 		return ((size_t)-1);
86 
87 	if (wcslen < nwchars)
88 		wcs[wcslen] = 0;
89 
90 	return (wcslen);
91 }
92 
93 
94 /*
95  * mbtowc
96  *
97  * The mbtowc() function converts a multibyte character mbchar into
98  * a wide character and stores the result in the object pointed to
99  * by wcharp. Up to nbytes bytes are examined.
100  *
101  * If mbchar is NULL, mbtowc() returns zero to indicate that shift
102  * states are not supported.  Shift states are used to switch between
103  * representation modes using reserved bytes to signal shifting
104  * without them being interpreted as characters.  If mbchar is null
105  * mbtowc should return non-zero if the current locale requires shift
106  * states.  Otherwise it should be return 0.
107  *
108  * If mbchar is non-null, returns the number of bytes processed in
109  * mbchar.  If mbchar is null, convert the null (wcharp=0) but
110  * return length zero.  If mbchar is invalid, returns -1.
111  */
112 int /*ARGSUSED*/
113 smb_mbtowc(uint32_t *wcharp, const char *mbchar, size_t nbytes)
114 {
115 	uint32_t wide_char;
116 	int count, err;
117 	size_t mblen;
118 	size_t wclen;
119 
120 	if (mbchar == NULL)
121 		return (0); /* no shift states */
122 
123 	/*
124 	 * How many bytes in this symbol?
125 	 */
126 	count = u8_validate((char *)mbchar, nbytes, NULL, 0, &err);
127 	if (count < 0)
128 		return (-1);
129 
130 	mblen = count;
131 	wclen = 1;
132 	err = uconv_u8tou32((const uchar_t *)mbchar, &mblen,
133 	    &wide_char, &wclen, UCONV_OUT_SYSTEM_ENDIAN);
134 	if (err != 0)
135 		return (-1);
136 	if (wclen == 0) {
137 		wide_char = 0;
138 		count = 0;
139 	}
140 
141 	if (wcharp)
142 		*wcharp = wide_char;
143 
144 	return (count);
145 }
146 
147 
148 /*
149  * wctomb
150  *
151  * The wctomb() function converts a wide character wchar into a multibyte
152  * character and stores the result in mbchar. The object pointed to by
153  * mbchar must be large enough to accommodate the multibyte character.
154  *
155  * Returns the numberof bytes written to mbchar.
156  * Note: handles null like any 1-byte char.
157  */
158 int
159 smb_wctomb(char *mbchar, uint32_t wchar)
160 {
161 	char junk[MTS_MB_CUR_MAX+1];
162 	size_t mblen;
163 	size_t wclen;
164 	int err;
165 
166 	if (mbchar == NULL)
167 		mbchar = junk;
168 
169 	mblen = MTS_MB_CUR_MAX;
170 	wclen = 1;
171 	err = uconv_u32tou8(&wchar, &wclen, (uchar_t *)mbchar, &mblen,
172 	    UCONV_IN_SYSTEM_ENDIAN | UCONV_IGNORE_NULL);
173 	if (err != 0)
174 		return (-1);
175 
176 	return ((int)mblen);
177 }
178 
179 
180 /*
181  * wcstombs
182  *
183  * The wcstombs() function converts a wide character string wcstring
184  * into a multibyte character string mbstring. Up to nbytes bytes are
185  * stored in mbstring. Partial multibyte characters at the end of the
186  * string are not stored. The multibyte character string is null
187  * terminated if there is room.
188  *
189  * Returns the number of bytes converted, not counting the terminating
190  * null byte. Returns -1 if an invalid WC sequence is encountered.
191  */
192 size_t
193 smb_wcstombs(char *mbs, const smb_wchar_t *wcs, size_t nbytes)
194 {
195 	size_t mbslen, wcslen;
196 	int err;
197 
198 	/* NULL or empty input is allowed. */
199 	if (wcs == NULL || *wcs == 0) {
200 		if (mbs != NULL && nbytes > 0)
201 			*mbs = '\0';
202 		return (0);
203 	}
204 
205 	/*
206 	 * Traditional wcstombs(3C) allows mbs==NULL to get the length.
207 	 * SMB never calls it that way, but let's future-proof.
208 	 */
209 	if (mbs == NULL) {
210 		return ((size_t)-1);
211 	}
212 
213 	/*
214 	 * Compute wcslen
215 	 */
216 	wcslen = 0;
217 	while (wcs[wcslen] != 0)
218 		wcslen++;
219 
220 	mbslen = nbytes;
221 	err = uconv_u16tou8(wcs, &wcslen,
222 	    (uchar_t *)mbs, &mbslen, UCONV_IN_LITTLE_ENDIAN);
223 	if (err != 0)
224 		return ((size_t)-1);
225 
226 	if (mbslen < nbytes)
227 		mbs[mbslen] = '\0';
228 
229 	return (mbslen);
230 }
231 
232 
233 /*
234  * Returns the number of bytes that would be written if the multi-
235  * byte string mbs was converted to a wide character string, not
236  * counting the terminating null wide character.
237  */
238 size_t
239 smb_wcequiv_strlen(const char *mbs)
240 {
241 	uint32_t	wide_char;
242 	size_t bytes;
243 	size_t len = 0;
244 
245 	while (*mbs) {
246 		bytes = smb_mbtowc(&wide_char, mbs, MTS_MB_CHAR_MAX);
247 		if (bytes == ((size_t)-1))
248 			return ((size_t)-1);
249 		mbs += bytes;
250 
251 		len += sizeof (smb_wchar_t);
252 		if (bytes > 3) {
253 			/*
254 			 * Extended unicode, so TWO smb_wchar_t
255 			 */
256 			len += sizeof (smb_wchar_t);
257 		}
258 	}
259 
260 	return (len);
261 }
262 
263 
264 /*
265  * Returns the number of bytes that would be written if the multi-
266  * byte string mbs was converted to an OEM character string,
267  * (smb_mbstooem) not counting the terminating null character.
268  */
269 size_t
270 smb_sbequiv_strlen(const char *mbs)
271 {
272 	size_t nbytes;
273 	size_t len = 0;
274 
275 	while (*mbs) {
276 		nbytes = smb_mbtowc(NULL, mbs, MTS_MB_CHAR_MAX);
277 		if (nbytes == ((size_t)-1))
278 			return ((size_t)-1);
279 		if (nbytes == 0)
280 			break;
281 
282 		if (nbytes == 1) {
283 			/* ASCII */
284 			len++;
285 		} else if (nbytes < 8) {
286 			/* Compute OEM length */
287 			char mbsbuf[8];
288 			uint8_t oembuf[8];
289 			int oemlen;
290 			(void) strlcpy(mbsbuf, mbs, nbytes+1);
291 			oemlen = smb_mbstooem(oembuf, mbsbuf, 8);
292 			if (oemlen < 0)
293 				return ((size_t)-1);
294 			len += oemlen;
295 		} else {
296 			return ((size_t)-1);
297 		}
298 
299 		mbs += nbytes;
300 	}
301 
302 	return (len);
303 }
304 
/*
 * Convert OEM strings to/from internal (UTF-8) form.
 *
 * We rarely encounter these anymore because all modern
 * SMB clients use Unicode (UTF-16). The few cases where
 * this IS still called are normally using ASCII, e.g.
 * tag names etc., so we short-cut those cases.  If we get
 * something non-ASCII we have to call iconv.
 *
 * If we were to really support OEM code pages, we would
 * need to have a way to set the OEM code page from some
 * configuration value.  For now it's always CP850.
 * See also ./smb_oem.c
 */
/* Code page name handed to iconv_open()/kiconv_open() as the OEM side. */
static char smb_oem_codepage[32] = "CP850";
320 
321 /*
322  * smb_oemtombs
323  *
324  * Convert a null terminated OEM string 'string' to a UTF-8 string
325  * no longer than max_mblen (null terminated if space).
326  *
327  * If the input string contains invalid OEM characters, a value
328  * of -1 will be returned. Otherwise returns the length of 'mbs',
329  * excluding the terminating null character.
330  *
331  * If either mbstring or string is a null pointer, -1 is returned.
332  */
333 int
334 smb_oemtombs(char *mbs, const uint8_t *oems, int max_mblen)
335 {
336 	uchar_t *p;
337 	int	oemlen;
338 	int	rlen;
339 	boolean_t need_iconv = B_FALSE;
340 
341 	if (mbs == NULL || oems == NULL)
342 		return (-1);
343 
344 	/*
345 	 * Check if the oems is all ASCII (and get the length
346 	 * while we're at it) so we know if we need to iconv.
347 	 * We usually can avoid the iconv calls.
348 	 */
349 	oemlen = 0;
350 	p = (uchar_t *)oems;
351 	while (*p != '\0') {
352 		oemlen++;
353 		if (*p & 0x80)
354 			need_iconv = B_TRUE;
355 		p++;
356 	}
357 
358 	if (need_iconv) {
359 		int	rc;
360 		char	*obuf = mbs;
361 		size_t	olen = max_mblen;
362 		size_t	ilen = oemlen;
363 #if defined(_KERNEL) || defined(_FAKE_KERNEL)
364 		char *ibuf = (char *)oems;
365 		kiconv_t ic;
366 		int	err;
367 
368 		ic = kiconv_open("UTF-8", smb_oem_codepage);
369 		if (ic == (kiconv_t)-1)
370 			goto just_copy;
371 		rc = kiconv(ic, &ibuf, &ilen, &obuf, &olen, &err);
372 		(void) kiconv_close(ic);
373 #else	/* _KERNEL || _FAKE_KERNEL */
374 		const char *ibuf = (char *)oems;
375 		iconv_t	ic;
376 		ic = iconv_open("UTF-8", smb_oem_codepage);
377 		if (ic == (iconv_t)-1)
378 			goto just_copy;
379 		rc = iconv(ic, &ibuf, &ilen, &obuf, &olen);
380 		(void) iconv_close(ic);
381 #endif	/* _KERNEL || _FAKE_KERNEL */
382 		if (rc < 0)
383 			return (-1);
384 		/* Return val. is output bytes. */
385 		rlen = (max_mblen - olen);
386 	} else {
387 	just_copy:
388 		rlen = oemlen;
389 		if (rlen > max_mblen)
390 			rlen = max_mblen;
391 		bcopy(oems, mbs, rlen);
392 	}
393 	if (rlen < max_mblen)
394 		mbs[rlen] = '\0';
395 
396 	return (rlen);
397 }
398 
399 /*
400  * smb_mbstooem
401  *
402  * Convert a null terminated multi-byte string 'mbs' to an OEM string
403  * no longer than max_oemlen (null terminated if space).
404  *
405  * If the input string contains invalid multi-byte characters, a value
406  * of -1 will be returned. Otherwise returns the length of 'oems',
407  * excluding the terminating null character.
408  *
409  * If either mbstring or string is a null pointer, -1 is returned.
410  */
411 int
412 smb_mbstooem(uint8_t *oems, const char *mbs, int max_oemlen)
413 {
414 	uchar_t *p;
415 	int	mbslen;
416 	int	rlen;
417 	boolean_t need_iconv = B_FALSE;
418 
419 	if (oems == NULL || mbs == NULL)
420 		return (-1);
421 
422 	/*
423 	 * Check if the mbs is all ASCII (and get the length
424 	 * while we're at it) so we know if we need to iconv.
425 	 * We usually can avoid the iconv calls.
426 	 */
427 	mbslen = 0;
428 	p = (uchar_t *)mbs;
429 	while (*p != '\0') {
430 		mbslen++;
431 		if (*p & 0x80)
432 			need_iconv = B_TRUE;
433 		p++;
434 	}
435 
436 	if (need_iconv) {
437 		int	rc;
438 		char	*obuf = (char *)oems;
439 		size_t	olen = max_oemlen;
440 		size_t	ilen = mbslen;
441 #if defined(_KERNEL) || defined(_FAKE_KERNEL)
442 		char *ibuf = (char *)mbs;
443 		kiconv_t ic;
444 		int	err;
445 
446 		ic = kiconv_open(smb_oem_codepage, "UTF-8");
447 		if (ic == (kiconv_t)-1)
448 			goto just_copy;
449 		rc = kiconv(ic, &ibuf, &ilen, &obuf, &olen, &err);
450 		(void) kiconv_close(ic);
451 #else	/* _KERNEL || _FAKE_KERNEL */
452 		const char *ibuf = mbs;
453 		iconv_t	ic;
454 		ic = iconv_open(smb_oem_codepage, "UTF-8");
455 		if (ic == (iconv_t)-1)
456 			goto just_copy;
457 		rc = iconv(ic, &ibuf, &ilen, &obuf, &olen);
458 		(void) iconv_close(ic);
459 #endif	/* _KERNEL || _FAKE_KERNEL */
460 		if (rc < 0)
461 			return (-1);
462 		/* Return val. is output bytes. */
463 		rlen = (max_oemlen - olen);
464 	} else {
465 	just_copy:
466 		rlen = mbslen;
467 		if (rlen > max_oemlen)
468 			rlen = max_oemlen;
469 		bcopy(mbs, oems, rlen);
470 	}
471 	if (rlen < max_oemlen)
472 		oems[rlen] = '\0';
473 
474 	return (rlen);
475 }
476