xref: /illumos-gate/usr/src/common/smbsrv/smb_utf8.c (revision 52244c0958bdf281ca42932b449f644b4decfdc2)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Multibyte/wide-char conversion routines. Wide-char encoding provides
28  * a fixed size character encoding that maps to the Unicode 16-bit
29  * (UCS-2) character set standard. Multibyte or UCS transformation
30  * format (UTF) encoding is a variable length character encoding scheme
31  * that is compatible with existing ASCII characters and guarantees that
32  * the resultant strings do not contain embedded null characters. Both
33  * types of encoding provide a null terminator: single byte for UTF-8
34  * and a wide-char null for Unicode. See RFC 2044.
35  *
36  * The table below illustrates the UTF-8 encoding scheme. The letter x
37  * indicates bits available for encoding the character value.
38  *
39  *	UCS-2			UTF-8 octet sequence (binary)
40  *	0x0000-0x007F	0xxxxxxx
41  *	0x0080-0x07FF	110xxxxx 10xxxxxx
42  *	0x0800-0xFFFF	1110xxxx 10xxxxxx 10xxxxxx
43  *
44  * RFC 2044
45  * UTF-8,a transformation format of UNICODE and ISO 10646
46  * F. Yergeau
47  * Alis Technologies
48  * October 1996
49  */
50 
51 #ifdef _KERNEL
52 #include <sys/types.h>
53 #include <sys/sunddi.h>
54 #else
55 #include <stdio.h>
56 #include <stdlib.h>
57 #include <assert.h>
58 #include <strings.h>
59 #endif
60 #include <smbsrv/string.h>
61 
62 
63 /*
64  * mbstowcs
65  *
66  * The mbstowcs() function converts a multibyte character string
67  * mbstring into a wide character string wcstring. No more than
68  * nwchars wide characters are stored. A terminating null wide
69  * character is appended if there is room.
70  *
71  * Returns the number of wide characters converted, not counting
72  * any terminating null wide character. Returns -1 if an invalid
73  * multibyte character is encountered.
74  */
75 size_t
76 smb_mbstowcs(smb_wchar_t *wcstring, const char *mbstring, size_t nwchars)
77 {
78 	int len;
79 	smb_wchar_t	*start = wcstring;
80 
81 	while (nwchars--) {
82 		len = smb_mbtowc(wcstring, mbstring, MTS_MB_CHAR_MAX);
83 		if (len < 0) {
84 			*wcstring = 0;
85 			return ((size_t)-1);
86 		}
87 
88 		if (*mbstring == 0)
89 			break;
90 
91 		++wcstring;
92 		mbstring += len;
93 	}
94 
95 	return (wcstring - start);
96 }
97 
98 
99 /*
100  * mbtowc
101  *
102  * The mbtowc() function converts a multibyte character mbchar into
103  * a wide character and stores the result in the object pointed to
104  * by wcharp. Up to nbytes bytes are examined.
105  *
106  * If mbchar is NULL, mbtowc() returns zero to indicate that shift
107  * states are not supported.  Shift states are used to switch between
108  * representation modes using reserved bytes to signal shifting
109  * without them being interpreted as characters.  If mbchar is null
110  * mbtowc should return non-zero if the current locale requires shift
111  * states.  Otherwise it should be return 0.
112  *
113  * If mbchar is non-null, returns the number of bytes processed in
114  * mbchar.  If mbchar is invalid, returns -1.
115  */
116 int /*ARGSUSED*/
117 smb_mbtowc(smb_wchar_t *wcharp, const char *mbchar, size_t nbytes)
118 {
119 	unsigned char mbyte;
120 	smb_wchar_t wide_char;
121 	int count;
122 	int bytes_left;
123 
124 	if (mbchar == NULL)
125 		return (0); /* no shift states */
126 
127 	/* 0xxxxxxx -> 1 byte ASCII encoding */
128 	if (((mbyte = *mbchar++) & 0x80) == 0) {
129 		if (wcharp)
130 			*wcharp = (smb_wchar_t)mbyte;
131 
132 		return (mbyte ? 1 : 0);
133 	}
134 
135 	/* 10xxxxxx -> invalid first byte */
136 	if ((mbyte & 0x40) == 0)
137 		return (-1);
138 
139 	wide_char = mbyte;
140 	if ((mbyte & 0x20) == 0) {
141 		wide_char &= 0x1f;
142 		bytes_left = 1;
143 	} else if ((mbyte & 0x10) == 0) {
144 		wide_char &= 0x0f;
145 		bytes_left = 2;
146 	} else {
147 		return (-1);
148 	}
149 
150 	count = 1;
151 	while (bytes_left--) {
152 		if (((mbyte = *mbchar++) & 0xc0) != 0x80)
153 			return (-1);
154 
155 		count++;
156 		wide_char = (wide_char << 6) | (mbyte & 0x3f);
157 	}
158 
159 	if (wcharp)
160 		*wcharp = wide_char;
161 
162 	return (count);
163 }
164 
165 
166 /*
167  * wctomb
168  *
169  * The wctomb() function converts a wide character wchar into a multibyte
170  * character and stores the result in mbchar. The object pointed to by
171  * mbchar must be large enough to accommodate the multibyte character.
172  *
173  * Returns the numberof bytes written to mbchar.
174  */
175 int
176 smb_wctomb(char *mbchar, smb_wchar_t wchar)
177 {
178 	if ((wchar & ~0x7f) == 0) {
179 		*mbchar = (char)wchar;
180 		return (1);
181 	}
182 
183 	if ((wchar & ~0x7ff) == 0) {
184 		*mbchar++ = (wchar >> 6) | 0xc0;
185 		*mbchar = (wchar & 0x3f) | 0x80;
186 		return (2);
187 	}
188 
189 	*mbchar++ = (wchar >> 12) | 0xe0;
190 	*mbchar++ = ((wchar >> 6) & 0x3f) | 0x80;
191 	*mbchar = (wchar & 0x3f) | 0x80;
192 	return (3);
193 }
194 
195 
196 /*
197  * wcstombs
198  *
199  * The wcstombs() function converts a wide character string wcstring
200  * into a multibyte character string mbstring. Up to nbytes bytes are
201  * stored in mbstring. Partial multibyte characters at the end of the
202  * string are not stored. The multibyte character string is null
203  * terminated if there is room.
204  *
205  * Returns the number of bytes converted, not counting the terminating
206  * null byte.
207  */
208 size_t
209 smb_wcstombs(char *mbstring, const smb_wchar_t *wcstring, size_t nbytes)
210 {
211 	char *start = mbstring;
212 	const smb_wchar_t *wcp = wcstring;
213 	smb_wchar_t wide_char;
214 	char buf[4];
215 	size_t len;
216 
217 	if ((mbstring == NULL) || (wcstring == NULL))
218 		return (0);
219 
220 	while (nbytes > MTS_MB_CHAR_MAX) {
221 		wide_char = *wcp++;
222 		len = smb_wctomb(mbstring, wide_char);
223 
224 		if (wide_char == 0)
225 			/*LINTED E_PTRDIFF_OVERFLOW*/
226 			return (mbstring - start);
227 
228 		mbstring += len;
229 		nbytes -= len;
230 	}
231 
232 	while (wide_char && nbytes) {
233 		wide_char = *wcp++;
234 		if ((len = smb_wctomb(buf, wide_char)) > nbytes) {
235 			*mbstring = 0;
236 			break;
237 		}
238 
239 		bcopy(buf, mbstring, len);
240 		mbstring += len;
241 		nbytes -= len;
242 	}
243 
244 	/*LINTED E_PTRDIFF_OVERFLOW*/
245 	return (mbstring - start);
246 }
247 
248 
249 /*
250  * Returns the number of bytes that would be written if the multi-
251  * byte string mbs was converted to a wide character string, not
252  * counting the terminating null wide character.
253  */
254 size_t
255 smb_wcequiv_strlen(const char *mbs)
256 {
257 	smb_wchar_t	wide_char;
258 	size_t bytes;
259 	size_t len = 0;
260 
261 	while (*mbs) {
262 		bytes = smb_mbtowc(&wide_char, mbs, MTS_MB_CHAR_MAX);
263 		if (bytes == ((size_t)-1))
264 			return ((size_t)-1);
265 
266 		len += sizeof (smb_wchar_t);
267 		mbs += bytes;
268 	}
269 
270 	return (len);
271 }
272 
273 
274 /*
275  * Returns the number of bytes that would be written if the multi-
276  * byte string mbs was converted to a single byte character string,
277  * not counting the terminating null character.
278  */
279 size_t
280 smb_sbequiv_strlen(const char *mbs)
281 {
282 	smb_wchar_t	wide_char;
283 	size_t nbytes;
284 	size_t len = 0;
285 
286 	while (*mbs) {
287 		nbytes = smb_mbtowc(&wide_char, mbs, MTS_MB_CHAR_MAX);
288 		if (nbytes == ((size_t)-1))
289 			return ((size_t)-1);
290 
291 		if (wide_char & 0xFF00)
292 			len += sizeof (smb_wchar_t);
293 		else
294 			++len;
295 
296 		mbs += nbytes;
297 	}
298 
299 	return (len);
300 }
301 
302 
303 /*
304  * stombs
305  *
306  * Convert a regular null terminated string 'string' to a UTF-8 encoded
307  * null terminated multi-byte string 'mbstring'. Only full converted
308  * UTF-8 characters will be written 'mbstring'. If a character will not
309  * fit within the remaining buffer space or 'mbstring' will overflow
310  * max_mblen, the conversion process will be terminated and 'mbstring'
311  * will be null terminated.
312  *
313  * Returns the number of bytes written to 'mbstring', excluding the
314  * terminating null character.
315  *
316  * If either mbstring or string is a null pointer, -1 is returned.
317  */
318 int
319 smb_stombs(char *mbstring, char *string, int max_mblen)
320 {
321 	char *start = mbstring;
322 	unsigned char *p = (unsigned char *)string;
323 	int space_left = max_mblen;
324 	int	len;
325 	smb_wchar_t	wide_char;
326 	char buf[4];
327 
328 	if (!mbstring || !string)
329 		return (-1);
330 
331 	while (*p && space_left > 2) {
332 		wide_char = *p++;
333 		len = smb_wctomb(mbstring, wide_char);
334 		mbstring += len;
335 		space_left -= len;
336 	}
337 
338 	if (*p) {
339 		wide_char = *p;
340 		if ((len = smb_wctomb(buf, wide_char)) < 2) {
341 			*mbstring = *buf;
342 			mbstring += len;
343 			space_left -= len;
344 		}
345 	}
346 
347 	*mbstring = '\0';
348 
349 	/*LINTED E_PTRDIFF_OVERFLOW*/
350 	return (mbstring - start);
351 }
352 
353 
354 /*
355  * mbstos
356  *
357  * Convert a null terminated multi-byte string 'mbstring' to a regular
358  * null terminated string 'string'.  A 1-byte character in 'mbstring'
359  * maps to a 1-byte character in 'string'. A 2-byte character in
360  * 'mbstring' will be mapped to 2-bytes, if the upper byte is non-null.
361  * Otherwise the upper byte null will be discarded to ensure that the
362  * output stream does not contain embedded null characters.
363  *
364  * If the input stream contains invalid multi-byte characters, a value
365  * of -1 will be returned. Otherwise the length of 'string', excluding
366  * the terminating null character, is returned.
367  *
368  * If either mbstring or string is a null pointer, -1 is returned.
369  */
370 int
371 smb_mbstos(char *string, const char *mbstring)
372 {
373 	smb_wchar_t wc;
374 	unsigned char *start = (unsigned char *)string;
375 	int len;
376 
377 	if (string == NULL || mbstring == NULL)
378 		return (-1);
379 
380 	while (*mbstring) {
381 		if ((len = smb_mbtowc(&wc, mbstring, MTS_MB_CHAR_MAX)) < 0) {
382 			*string = 0;
383 			return (-1);
384 		}
385 
386 		if (wc & 0xFF00) {
387 			/*LINTED E_BAD_PTR_CAST_ALIGN*/
388 			*((smb_wchar_t *)string) = wc;
389 			string += sizeof (smb_wchar_t);
390 		}
391 		else
392 		{
393 			*string = (unsigned char)wc;
394 			string++;
395 		}
396 
397 		mbstring += len;
398 	}
399 
400 	*string = 0;
401 
402 	/*LINTED E_PTRDIFF_OVERFLOW*/
403 	return ((unsigned char *)string - start);
404 }
405