xref: /illumos-gate/usr/src/common/smbsrv/smb_utf8.c (revision 66492cf01c4f0eb178cb6e056451d04be61a0374)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  *
25  * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
26  */
27 
28 /*
29  * Multibyte/wide-char conversion routines. Wide-char encoding provides
30  * a fixed size character encoding that maps to the Unicode 16-bit
31  * (UCS-2) character set standard. Multibyte or UCS transformation
32  * format (UTF) encoding is a variable length character encoding scheme
33  * that is compatible with existing ASCII characters and guarantees that
34  * the resultant strings do not contain embedded null characters. Both
35  * types of encoding provide a null terminator: single byte for UTF-8
36  * and a wide-char null for Unicode. See RFC 2044.
37  *
38  * The table below illustrates the UTF-8 encoding scheme. The letter x
39  * indicates bits available for encoding the character value.
40  *
41  *	UCS-2			UTF-8 octet sequence (binary)
42  *	0x0000-0x007F	0xxxxxxx
43  *	0x0080-0x07FF	110xxxxx 10xxxxxx
44  *	0x0800-0xFFFF	1110xxxx 10xxxxxx 10xxxxxx
45  *
46  * RFC 2044
47  * UTF-8,a transformation format of UNICODE and ISO 10646
48  * F. Yergeau
49  * Alis Technologies
50  * October 1996
51  */
52 
53 #if defined(_KERNEL) || defined(_FAKE_KERNEL)
54 #include <sys/types.h>
55 #include <sys/sunddi.h>
56 #else
57 #include <stdio.h>
58 #include <stdlib.h>
59 #include <assert.h>
60 #include <strings.h>
61 #endif
62 #include <smbsrv/string.h>
63 
64 
65 /*
66  * mbstowcs
67  *
68  * The mbstowcs() function converts a multibyte character string
69  * mbstring into a wide character string wcstring. No more than
70  * nwchars wide characters are stored. A terminating null wide
71  * character is appended if there is room.
72  *
73  * Returns the number of wide characters converted, not counting
74  * any terminating null wide character. Returns -1 if an invalid
75  * multibyte character is encountered.
76  */
77 size_t
78 smb_mbstowcs(smb_wchar_t *wcstring, const char *mbstring, size_t nwchars)
79 {
80 	int len;
81 	smb_wchar_t	*start = wcstring;
82 
83 	while (nwchars--) {
84 		len = smb_mbtowc(wcstring, mbstring, MTS_MB_CHAR_MAX);
85 		if (len < 0) {
86 			*wcstring = 0;
87 			return ((size_t)-1);
88 		}
89 
90 		if (*mbstring == 0)
91 			break;
92 
93 		++wcstring;
94 		mbstring += len;
95 	}
96 
97 	return (wcstring - start);
98 }
99 
100 
101 /*
102  * mbtowc
103  *
104  * The mbtowc() function converts a multibyte character mbchar into
105  * a wide character and stores the result in the object pointed to
106  * by wcharp. Up to nbytes bytes are examined.
107  *
108  * If mbchar is NULL, mbtowc() returns zero to indicate that shift
109  * states are not supported.  Shift states are used to switch between
110  * representation modes using reserved bytes to signal shifting
111  * without them being interpreted as characters.  If mbchar is null
112  * mbtowc should return non-zero if the current locale requires shift
113  * states.  Otherwise it should be return 0.
114  *
115  * If mbchar is non-null, returns the number of bytes processed in
116  * mbchar.  If mbchar is invalid, returns -1.
117  */
118 int /*ARGSUSED*/
119 smb_mbtowc(smb_wchar_t *wcharp, const char *mbchar, size_t nbytes)
120 {
121 	unsigned char mbyte;
122 	smb_wchar_t wide_char;
123 	int count;
124 	int bytes_left;
125 
126 	if (mbchar == NULL)
127 		return (0); /* no shift states */
128 
129 	/* 0xxxxxxx -> 1 byte ASCII encoding */
130 	if (((mbyte = *mbchar++) & 0x80) == 0) {
131 		if (wcharp)
132 			*wcharp = (smb_wchar_t)mbyte;
133 
134 		return (mbyte ? 1 : 0);
135 	}
136 
137 	/* 10xxxxxx -> invalid first byte */
138 	if ((mbyte & 0x40) == 0)
139 		return (-1);
140 
141 	wide_char = mbyte;
142 	if ((mbyte & 0x20) == 0) {
143 		wide_char &= 0x1f;
144 		bytes_left = 1;
145 	} else if ((mbyte & 0x10) == 0) {
146 		wide_char &= 0x0f;
147 		bytes_left = 2;
148 	} else {
149 		return (-1);
150 	}
151 
152 	count = 1;
153 	while (bytes_left--) {
154 		if (((mbyte = *mbchar++) & 0xc0) != 0x80)
155 			return (-1);
156 
157 		count++;
158 		wide_char = (wide_char << 6) | (mbyte & 0x3f);
159 	}
160 
161 	if (wcharp)
162 		*wcharp = wide_char;
163 
164 	return (count);
165 }
166 
167 
168 /*
169  * wctomb
170  *
171  * The wctomb() function converts a wide character wchar into a multibyte
172  * character and stores the result in mbchar. The object pointed to by
173  * mbchar must be large enough to accommodate the multibyte character.
174  *
175  * Returns the numberof bytes written to mbchar.
176  */
177 int
178 smb_wctomb(char *mbchar, smb_wchar_t wchar)
179 {
180 	if ((wchar & ~0x7f) == 0) {
181 		*mbchar = (char)wchar;
182 		return (1);
183 	}
184 
185 	if ((wchar & ~0x7ff) == 0) {
186 		*mbchar++ = (wchar >> 6) | 0xc0;
187 		*mbchar = (wchar & 0x3f) | 0x80;
188 		return (2);
189 	}
190 
191 	*mbchar++ = (wchar >> 12) | 0xe0;
192 	*mbchar++ = ((wchar >> 6) & 0x3f) | 0x80;
193 	*mbchar = (wchar & 0x3f) | 0x80;
194 	return (3);
195 }
196 
197 
198 /*
199  * wcstombs
200  *
201  * The wcstombs() function converts a wide character string wcstring
202  * into a multibyte character string mbstring. Up to nbytes bytes are
203  * stored in mbstring. Partial multibyte characters at the end of the
204  * string are not stored. The multibyte character string is null
205  * terminated if there is room.
206  *
207  * Returns the number of bytes converted, not counting the terminating
208  * null byte.
209  */
210 size_t
211 smb_wcstombs(char *mbstring, const smb_wchar_t *wcstring, size_t nbytes)
212 {
213 	char *start = mbstring;
214 	const smb_wchar_t *wcp = wcstring;
215 	smb_wchar_t wide_char = 0;
216 	char buf[4];
217 	size_t len;
218 
219 	if ((mbstring == NULL) || (wcstring == NULL))
220 		return (0);
221 
222 	while (nbytes > MTS_MB_CHAR_MAX) {
223 		wide_char = *wcp++;
224 		len = smb_wctomb(mbstring, wide_char);
225 
226 		if (wide_char == 0)
227 			/*LINTED E_PTRDIFF_OVERFLOW*/
228 			return (mbstring - start);
229 
230 		mbstring += len;
231 		nbytes -= len;
232 	}
233 
234 	while (wide_char && nbytes) {
235 		wide_char = *wcp++;
236 		if ((len = smb_wctomb(buf, wide_char)) > nbytes) {
237 			*mbstring = 0;
238 			break;
239 		}
240 
241 		bcopy(buf, mbstring, len);
242 		mbstring += len;
243 		nbytes -= len;
244 	}
245 
246 	/*LINTED E_PTRDIFF_OVERFLOW*/
247 	return (mbstring - start);
248 }
249 
250 
251 /*
252  * Returns the number of bytes that would be written if the multi-
253  * byte string mbs was converted to a wide character string, not
254  * counting the terminating null wide character.
255  */
256 size_t
257 smb_wcequiv_strlen(const char *mbs)
258 {
259 	smb_wchar_t	wide_char;
260 	size_t bytes;
261 	size_t len = 0;
262 
263 	while (*mbs) {
264 		bytes = smb_mbtowc(&wide_char, mbs, MTS_MB_CHAR_MAX);
265 		if (bytes == ((size_t)-1))
266 			return ((size_t)-1);
267 
268 		len += sizeof (smb_wchar_t);
269 		mbs += bytes;
270 	}
271 
272 	return (len);
273 }
274 
275 
276 /*
277  * Returns the number of bytes that would be written if the multi-
278  * byte string mbs was converted to a single byte character string,
279  * not counting the terminating null character.
280  */
281 size_t
282 smb_sbequiv_strlen(const char *mbs)
283 {
284 	smb_wchar_t	wide_char;
285 	size_t nbytes;
286 	size_t len = 0;
287 
288 	while (*mbs) {
289 		nbytes = smb_mbtowc(&wide_char, mbs, MTS_MB_CHAR_MAX);
290 		if (nbytes == ((size_t)-1))
291 			return ((size_t)-1);
292 
293 		if (wide_char & 0xFF00)
294 			len += sizeof (smb_wchar_t);
295 		else
296 			++len;
297 
298 		mbs += nbytes;
299 	}
300 
301 	return (len);
302 }
303 
304 
305 /*
306  * stombs
307  *
308  * Convert a regular null terminated string 'string' to a UTF-8 encoded
309  * null terminated multi-byte string 'mbstring'. Only full converted
310  * UTF-8 characters will be written 'mbstring'. If a character will not
311  * fit within the remaining buffer space or 'mbstring' will overflow
312  * max_mblen, the conversion process will be terminated and 'mbstring'
313  * will be null terminated.
314  *
315  * Returns the number of bytes written to 'mbstring', excluding the
316  * terminating null character.
317  *
318  * If either mbstring or string is a null pointer, -1 is returned.
319  */
320 int
321 smb_stombs(char *mbstring, char *string, int max_mblen)
322 {
323 	char *start = mbstring;
324 	unsigned char *p = (unsigned char *)string;
325 	int space_left = max_mblen;
326 	int	len;
327 	smb_wchar_t	wide_char;
328 	char buf[4];
329 
330 	if (!mbstring || !string)
331 		return (-1);
332 
333 	while (*p && space_left > 2) {
334 		wide_char = *p++;
335 		len = smb_wctomb(mbstring, wide_char);
336 		mbstring += len;
337 		space_left -= len;
338 	}
339 
340 	if (*p) {
341 		wide_char = *p;
342 		if ((len = smb_wctomb(buf, wide_char)) < 2) {
343 			*mbstring = *buf;
344 			mbstring += len;
345 			space_left -= len;
346 		}
347 	}
348 
349 	*mbstring = '\0';
350 
351 	/*LINTED E_PTRDIFF_OVERFLOW*/
352 	return (mbstring - start);
353 }
354 
355 
356 /*
357  * mbstos
358  *
359  * Convert a null terminated multi-byte string 'mbstring' to a regular
360  * null terminated string 'string'.  A 1-byte character in 'mbstring'
361  * maps to a 1-byte character in 'string'. A 2-byte character in
362  * 'mbstring' will be mapped to 2-bytes, if the upper byte is non-null.
363  * Otherwise the upper byte null will be discarded to ensure that the
364  * output stream does not contain embedded null characters.
365  *
366  * If the input stream contains invalid multi-byte characters, a value
367  * of -1 will be returned. Otherwise the length of 'string', excluding
368  * the terminating null character, is returned.
369  *
370  * If either mbstring or string is a null pointer, -1 is returned.
371  */
372 int
373 smb_mbstos(char *string, const char *mbstring)
374 {
375 	smb_wchar_t wc;
376 	unsigned char *start = (unsigned char *)string;
377 	int len;
378 
379 	if (string == NULL || mbstring == NULL)
380 		return (-1);
381 
382 	while (*mbstring) {
383 		if ((len = smb_mbtowc(&wc, mbstring, MTS_MB_CHAR_MAX)) < 0) {
384 			*string = 0;
385 			return (-1);
386 		}
387 
388 		if (wc & 0xFF00) {
389 			/*LINTED E_BAD_PTR_CAST_ALIGN*/
390 			*((smb_wchar_t *)string) = wc;
391 			string += sizeof (smb_wchar_t);
392 		}
393 		else
394 		{
395 			*string = (unsigned char)wc;
396 			string++;
397 		}
398 
399 		mbstring += len;
400 	}
401 
402 	*string = 0;
403 
404 	/*LINTED E_PTRDIFF_OVERFLOW*/
405 	return ((unsigned char *)string - start);
406 }
407