xref: /illumos-gate/usr/src/common/smbsrv/smb_utf8.c (revision 07a6ae61f8958faa11352bf1b552d85d79e9cbbe)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  *
25  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
26  */
27 
28 /*
29  * Multibyte/wide-char conversion routines. Wide-char encoding provides
30  * a fixed size character encoding that maps to the Unicode 16-bit
31  * (UCS-2) character set standard. Multibyte or UCS transformation
32  * format (UTF) encoding is a variable length character encoding scheme
33  * that is compatible with existing ASCII characters and guarantees that
34  * the resultant strings do not contain embedded null characters. Both
35  * types of encoding provide a null terminator: single byte for UTF-8
36  * and a wide-char null for Unicode. See RFC 2044.
37  *
38  * The table below illustrates the UTF-8 encoding scheme. The letter x
39  * indicates bits available for encoding the character value.
40  *
41  *	UCS-2			UTF-8 octet sequence (binary)
42  *	0x0000-0x007F	0xxxxxxx
43  *	0x0080-0x07FF	110xxxxx 10xxxxxx
44  *	0x0800-0xFFFF	1110xxxx 10xxxxxx 10xxxxxx
45  *
46  * RFC 2044
47  * UTF-8, a transformation format of UNICODE and ISO 10646
48  * F. Yergeau
49  * Alis Technologies
50  * October 1996
51  */
52 
53 #if defined(_KERNEL) || defined(_FAKE_KERNEL)
54 #include <sys/types.h>
55 #include <sys/sunddi.h>
56 #else	/* _KERNEL || _FAKE_KERNEL */
57 #include <stdio.h>
58 #include <stdlib.h>
59 #include <strings.h>
60 #include <iconv.h>
61 #include <assert.h>
62 #endif	/* _KERNEL || _FAKE_KERNEL */
63 #include <smbsrv/string.h>
64 
65 
66 /*
67  * mbstowcs
68  *
69  * The mbstowcs() function converts a multibyte character string
70  * mbstring into a wide character string wcstring. No more than
71  * nwchars wide characters are stored. A terminating null wide
72  * character is appended if there is room.
73  *
74  * Returns the number of wide characters converted, not counting
75  * any terminating null wide character. Returns -1 if an invalid
76  * multibyte character is encountered.
77  */
78 size_t
79 smb_mbstowcs(smb_wchar_t *wcstring, const char *mbstring, size_t nwchars)
80 {
81 	int len;
82 	smb_wchar_t	*start = wcstring;
83 
84 	while (nwchars--) {
85 		len = smb_mbtowc(wcstring, mbstring, MTS_MB_CHAR_MAX);
86 		if (len < 0) {
87 			*wcstring = 0;
88 			return ((size_t)-1);
89 		}
90 
91 		if (*mbstring == 0)
92 			break;
93 
94 		++wcstring;
95 		mbstring += len;
96 	}
97 
98 	return (wcstring - start);
99 }
100 
101 
102 /*
103  * mbtowc
104  *
105  * The mbtowc() function converts a multibyte character mbchar into
106  * a wide character and stores the result in the object pointed to
107  * by wcharp. Up to nbytes bytes are examined.
108  *
109  * If mbchar is NULL, mbtowc() returns zero to indicate that shift
110  * states are not supported.  Shift states are used to switch between
111  * representation modes using reserved bytes to signal shifting
112  * without them being interpreted as characters.  If mbchar is null
113  * mbtowc should return non-zero if the current locale requires shift
114  * states.  Otherwise it should be return 0.
115  *
116  * If mbchar is non-null, returns the number of bytes processed in
117  * mbchar.  If mbchar is invalid, returns -1.
118  */
119 int /*ARGSUSED*/
120 smb_mbtowc(smb_wchar_t *wcharp, const char *mbchar, size_t nbytes)
121 {
122 	unsigned char mbyte;
123 	smb_wchar_t wide_char;
124 	int count;
125 	int bytes_left;
126 
127 	if (mbchar == NULL)
128 		return (0); /* no shift states */
129 
130 	/* 0xxxxxxx -> 1 byte ASCII encoding */
131 	if (((mbyte = *mbchar++) & 0x80) == 0) {
132 		if (wcharp)
133 			*wcharp = (smb_wchar_t)mbyte;
134 
135 		return (mbyte ? 1 : 0);
136 	}
137 
138 	/* 10xxxxxx -> invalid first byte */
139 	if ((mbyte & 0x40) == 0)
140 		return (-1);
141 
142 	wide_char = mbyte;
143 	if ((mbyte & 0x20) == 0) {
144 		wide_char &= 0x1f;
145 		bytes_left = 1;
146 	} else if ((mbyte & 0x10) == 0) {
147 		wide_char &= 0x0f;
148 		bytes_left = 2;
149 	} else {
150 		return (-1);
151 	}
152 
153 	count = 1;
154 	while (bytes_left--) {
155 		if (((mbyte = *mbchar++) & 0xc0) != 0x80)
156 			return (-1);
157 
158 		count++;
159 		wide_char = (wide_char << 6) | (mbyte & 0x3f);
160 	}
161 
162 	if (wcharp)
163 		*wcharp = wide_char;
164 
165 	return (count);
166 }
167 
168 
169 /*
170  * wctomb
171  *
172  * The wctomb() function converts a wide character wchar into a multibyte
173  * character and stores the result in mbchar. The object pointed to by
174  * mbchar must be large enough to accommodate the multibyte character.
175  *
176  * Returns the numberof bytes written to mbchar.
177  */
178 int
179 smb_wctomb(char *mbchar, smb_wchar_t wchar)
180 {
181 	if ((wchar & ~0x7f) == 0) {
182 		*mbchar = (char)wchar;
183 		return (1);
184 	}
185 
186 	if ((wchar & ~0x7ff) == 0) {
187 		*mbchar++ = (wchar >> 6) | 0xc0;
188 		*mbchar = (wchar & 0x3f) | 0x80;
189 		return (2);
190 	}
191 
192 	*mbchar++ = (wchar >> 12) | 0xe0;
193 	*mbchar++ = ((wchar >> 6) & 0x3f) | 0x80;
194 	*mbchar = (wchar & 0x3f) | 0x80;
195 	return (3);
196 }
197 
198 
199 /*
200  * wcstombs
201  *
202  * The wcstombs() function converts a wide character string wcstring
203  * into a multibyte character string mbstring. Up to nbytes bytes are
204  * stored in mbstring. Partial multibyte characters at the end of the
205  * string are not stored. The multibyte character string is null
206  * terminated if there is room.
207  *
208  * Returns the number of bytes converted, not counting the terminating
209  * null byte.
210  */
211 size_t
212 smb_wcstombs(char *mbstring, const smb_wchar_t *wcstring, size_t nbytes)
213 {
214 	char *start = mbstring;
215 	const smb_wchar_t *wcp = wcstring;
216 	smb_wchar_t wide_char = 0;
217 	char buf[4];
218 	size_t len;
219 
220 	if ((mbstring == NULL) || (wcstring == NULL))
221 		return (0);
222 
223 	while (nbytes > MTS_MB_CHAR_MAX) {
224 		wide_char = *wcp++;
225 		len = smb_wctomb(mbstring, wide_char);
226 
227 		if (wide_char == 0)
228 			/*LINTED E_PTRDIFF_OVERFLOW*/
229 			return (mbstring - start);
230 
231 		mbstring += len;
232 		nbytes -= len;
233 	}
234 
235 	while (wide_char && nbytes) {
236 		wide_char = *wcp++;
237 		if ((len = smb_wctomb(buf, wide_char)) > nbytes) {
238 			*mbstring = 0;
239 			break;
240 		}
241 
242 		bcopy(buf, mbstring, len);
243 		mbstring += len;
244 		nbytes -= len;
245 	}
246 
247 	/*LINTED E_PTRDIFF_OVERFLOW*/
248 	return (mbstring - start);
249 }
250 
251 
252 /*
253  * Returns the number of bytes that would be written if the multi-
254  * byte string mbs was converted to a wide character string, not
255  * counting the terminating null wide character.
256  */
257 size_t
258 smb_wcequiv_strlen(const char *mbs)
259 {
260 	smb_wchar_t	wide_char;
261 	size_t bytes;
262 	size_t len = 0;
263 
264 	while (*mbs) {
265 		bytes = smb_mbtowc(&wide_char, mbs, MTS_MB_CHAR_MAX);
266 		if (bytes == ((size_t)-1))
267 			return ((size_t)-1);
268 
269 		len += sizeof (smb_wchar_t);
270 		mbs += bytes;
271 	}
272 
273 	return (len);
274 }
275 
276 
277 /*
278  * Returns the number of bytes that would be written if the multi-
279  * byte string mbs was converted to an OEM character string,
280  * not counting the terminating null character.
281  */
282 size_t
283 smb_sbequiv_strlen(const char *mbs)
284 {
285 	smb_wchar_t	wide_char;
286 	size_t nbytes;
287 	size_t len = 0;
288 
289 	while (*mbs) {
290 		nbytes = smb_mbtowc(&wide_char, mbs, MTS_MB_CHAR_MAX);
291 		if (nbytes == ((size_t)-1))
292 			return ((size_t)-1);
293 
294 		/*
295 		 * Assume OEM characters are 1-byte (for now).
296 		 * That's true for cp850, which is the only
297 		 * codeset this currently supports.  See:
298 		 * smb_oem.c : smb_oem_codeset
299 		 */
300 		++len;
301 
302 		mbs += nbytes;
303 	}
304 
305 	return (len);
306 }
307 
/*
 * Convert OEM strings to/from internal (UTF-8) form.
 *
 * We rarely encounter these anymore because all modern
 * SMB clients use Unicode (UTF-16). The few cases where
 * this IS still called are normally using ASCII, i.e.
 * tag names etc. so those cases are short-cut with a
 * plain copy.  Only when something non-ASCII shows up
 * do we have to call iconv (kiconv in the kernel).
 *
 * smb_oem_codepage is the code page name handed to
 * iconv_open()/kiconv_open() by smb_oemtombs() and
 * smb_mbstooem().  If we were to really support OEM code
 * pages, we would need to have a way to set the OEM code
 * page from some configuration value.  For now it's
 * always CP850.  See also ./smb_oem.c
 */
static char smb_oem_codepage[32] = "CP850";
323 
324 /*
325  * smb_oemtombs
326  *
327  * Convert a null terminated OEM string 'string' to a UTF-8 string
328  * no longer than max_mblen (null terminated if space).
329  *
330  * If the input string contains invalid OEM characters, a value
331  * of -1 will be returned. Otherwise returns the length of 'mbs',
332  * excluding the terminating null character.
333  *
334  * If either mbstring or string is a null pointer, -1 is returned.
335  */
336 int
337 smb_oemtombs(char *mbs, const uint8_t *oems, int max_mblen)
338 {
339 	uchar_t *p;
340 	int	oemlen;
341 	int	rlen;
342 	boolean_t need_iconv = B_FALSE;
343 
344 	if (mbs == NULL || oems == NULL)
345 		return (-1);
346 
347 	/*
348 	 * Check if the oems is all ASCII (and get the length
349 	 * while we're at it) so we know if we need to iconv.
350 	 * We usually can avoid the iconv calls.
351 	 */
352 	oemlen = 0;
353 	p = (uchar_t *)oems;
354 	while (*p != '\0') {
355 		oemlen++;
356 		if (*p & 0x80)
357 			need_iconv = B_TRUE;
358 		p++;
359 	}
360 
361 	if (need_iconv) {
362 		int	rc;
363 		char	*obuf = mbs;
364 		size_t	olen = max_mblen;
365 		size_t	ilen = oemlen;
366 #if defined(_KERNEL) || defined(_FAKE_KERNEL)
367 		char *ibuf = (char *)oems;
368 		kiconv_t ic;
369 		int	err;
370 
371 		ic = kiconv_open("UTF-8", smb_oem_codepage);
372 		if (ic == (kiconv_t)-1)
373 			goto just_copy;
374 		rc = kiconv(ic, &ibuf, &ilen, &obuf, &olen, &err);
375 		(void) kiconv_close(ic);
376 #else	/* _KERNEL || _FAKE_KERNEL */
377 		const char *ibuf = (char *)oems;
378 		iconv_t	ic;
379 		ic = iconv_open("UTF-8", smb_oem_codepage);
380 		if (ic == (iconv_t)-1)
381 			goto just_copy;
382 		rc = iconv(ic, &ibuf, &ilen, &obuf, &olen);
383 		(void) iconv_close(ic);
384 #endif	/* _KERNEL || _FAKE_KERNEL */
385 		if (rc < 0)
386 			return (-1);
387 		/* Return val. is output bytes. */
388 		rlen = (max_mblen - olen);
389 	} else {
390 	just_copy:
391 		rlen = oemlen;
392 		if (rlen > max_mblen)
393 			rlen = max_mblen;
394 		bcopy(oems, mbs, rlen);
395 	}
396 	if (rlen < max_mblen)
397 		mbs[rlen] = '\0';
398 
399 	return (rlen);
400 }
401 
402 /*
403  * smb_mbstooem
404  *
405  * Convert a null terminated multi-byte string 'mbs' to an OEM string
406  * no longer than max_oemlen (null terminated if space).
407  *
408  * If the input string contains invalid multi-byte characters, a value
409  * of -1 will be returned. Otherwise returns the length of 'oems',
410  * excluding the terminating null character.
411  *
412  * If either mbstring or string is a null pointer, -1 is returned.
413  */
414 int
415 smb_mbstooem(uint8_t *oems, const char *mbs, int max_oemlen)
416 {
417 	uchar_t *p;
418 	int	mbslen;
419 	int	rlen;
420 	boolean_t need_iconv = B_FALSE;
421 
422 	if (oems == NULL || mbs == NULL)
423 		return (-1);
424 
425 	/*
426 	 * Check if the mbs is all ASCII (and get the length
427 	 * while we're at it) so we know if we need to iconv.
428 	 * We usually can avoid the iconv calls.
429 	 */
430 	mbslen = 0;
431 	p = (uchar_t *)mbs;
432 	while (*p != '\0') {
433 		mbslen++;
434 		if (*p & 0x80)
435 			need_iconv = B_TRUE;
436 		p++;
437 	}
438 
439 	if (need_iconv) {
440 		int	rc;
441 		char	*obuf = (char *)oems;
442 		size_t	olen = max_oemlen;
443 		size_t	ilen = mbslen;
444 #if defined(_KERNEL) || defined(_FAKE_KERNEL)
445 		char *ibuf = (char *)mbs;
446 		kiconv_t ic;
447 		int	err;
448 
449 		ic = kiconv_open(smb_oem_codepage, "UTF-8");
450 		if (ic == (kiconv_t)-1)
451 			goto just_copy;
452 		rc = kiconv(ic, &ibuf, &ilen, &obuf, &olen, &err);
453 		(void) kiconv_close(ic);
454 #else	/* _KERNEL || _FAKE_KERNEL */
455 		const char *ibuf = mbs;
456 		iconv_t	ic;
457 		ic = iconv_open(smb_oem_codepage, "UTF-8");
458 		if (ic == (iconv_t)-1)
459 			goto just_copy;
460 		rc = iconv(ic, &ibuf, &ilen, &obuf, &olen);
461 		(void) iconv_close(ic);
462 #endif	/* _KERNEL || _FAKE_KERNEL */
463 		if (rc < 0)
464 			return (-1);
465 		/* Return val. is output bytes. */
466 		rlen = (max_oemlen - olen);
467 	} else {
468 	just_copy:
469 		rlen = mbslen;
470 		if (rlen > max_oemlen)
471 			rlen = max_oemlen;
472 		bcopy(mbs, oems, rlen);
473 	}
474 	if (rlen < max_oemlen)
475 		oems[rlen] = '\0';
476 
477 	return (rlen);
478 }
479