xref: /illumos-gate/usr/src/common/smbsrv/smb_utf8.c (revision da6c28aaf62fa55f0fdb8004aa40f88f23bf53f0)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Multibyte/wide-char conversion routines. Wide-char encoding provides
28  * a fixed size character encoding that maps to the Unicode 16-bit
29  * (UCS-2) character set standard. Multibyte or UCS transformation
30  * format (UTF) encoding is a variable length character encoding scheme
 * that is compatible with existing ASCII characters and guarantees that
32  * the resultant strings do not contain embedded null characters. Both
33  * types of encoding provide a null terminator: single byte for UTF-8
34  * and a wide-char null for Unicode. See RFC 2044.
35  *
36  * The table below illustrates the UTF-8 encoding scheme. The letter x
37  * indicates bits available for encoding the character value.
38  *
39  *	UCS-2			UTF-8 octet sequence (binary)
40  *	0x0000-0x007F	0xxxxxxx
41  *	0x0080-0x07FF	110xxxxx 10xxxxxx
42  *	0x0800-0xFFFF	1110xxxx 10xxxxxx 10xxxxxx
43  *
44  * RFC 2044
 * UTF-8, a transformation format of UNICODE and ISO 10646
46  * F. Yergeau
47  * Alis Technologies
48  * October 1996
49  */
50 
51 #pragma ident	"%Z%%M%	%I%	%E% SMI"
52 
53 #ifdef _KERNEL
54 #include <sys/types.h>
55 #include <sys/sunddi.h>
56 #else
57 #include <stdio.h>
58 #include <stdlib.h>
59 #include <assert.h>
60 #include <strings.h>
61 #endif
62 #include <smbsrv/smb_i18n.h>
63 #include <smbsrv/string.h>
64 
/*
 * Diagnostic state touched by mts_mbtowc().  When mts_mbtowc() rejects
 * a multibyte sequence it sets mbtowc_announce to 1 if mbtowc_verbose
 * is non-zero or mbtowc_announce is still zero.  Nothing visible in
 * this file reads either flag for any other purpose; presumably they
 * are remnants of removed diagnostic output -- confirm against other
 * consumers before changing or deleting them.
 */
int mbtowc_verbose = 0;
int mbtowc_announce = 0;
67 
68 /*
69  * mbstowcs
70  *
71  * The mbstowcs() function converts a multibyte character string
72  * mbstring into a wide character string wcstring. No more than
73  * nwchars wide characters are stored. A terminating null wide
74  * character is appended if there is room.
75  *
76  * Returns the number of wide characters converted, not counting
77  * any terminating null wide character. Returns -1 if an invalid
78  * multibyte character is encountered.
79  */
80 size_t
81 mts_mbstowcs(mts_wchar_t *wcstring, const char *mbstring, size_t nwchars)
82 {
83 	int len;
84 	mts_wchar_t	*start = wcstring;
85 
86 	while (nwchars--) {
87 		len = mts_mbtowc(wcstring, mbstring, MTS_MB_CHAR_MAX);
88 		if (len < 0) {
89 			*wcstring = 0;
90 			return ((size_t)-1);
91 		}
92 
93 		if (*mbstring == 0)
94 			break;
95 
96 		++wcstring;
97 		mbstring += len;
98 	}
99 
100 	return (wcstring - start);
101 }
102 
103 
104 /*
105  * mbtowc
106  *
107  * The mbtowc() function converts a multibyte character mbchar into
108  * a wide character and stores the result in the object pointed to
109  * by wcharp. Up to nbytes bytes are examined.
110  *
111  * If mbchar is NULL, mbtowc() returns zero to indicate that shift
112  * states are not supported. If mbchar is valid, returns the number
113  * of bytes processed in mbchar. If mbchar is invalid, returns -1.
114  */
115 int /*ARGSUSED*/
116 mts_mbtowc(mts_wchar_t *wcharp, const char *mbchar, size_t nbytes)
117 {
118 	unsigned char mbyte;
119 	mts_wchar_t wide_char;
120 	int count;
121 	int bytes_left;
122 
123 	if (mbchar == 0)
124 		return (0); /* shift states not supported */
125 
126 	/* 0xxxxxxx -> 1 byte ASCII encoding */
127 	if (((mbyte = *mbchar++) & 0x80) == 0) {
128 		if (wcharp)
129 			*wcharp = (mts_wchar_t)mbyte;
130 
131 		return (mbyte ? 1 : 0);
132 	}
133 
134 	/* 10xxxxxx -> invalid first byte */
135 	if ((mbyte & 0x40) == 0) {
136 		if (mbtowc_verbose || mbtowc_announce == 0) {
137 			mbtowc_announce = 1;
138 		}
139 		return (-1);
140 	}
141 
142 	wide_char = mbyte;
143 	if ((mbyte & 0x20) == 0) {
144 		wide_char &= 0x1f;
145 		bytes_left = 1;
146 	} else if ((mbyte & 0x10) == 0) {
147 		wide_char &= 0x0f;
148 		bytes_left = 2;
149 	} else {
150 		if (mbtowc_verbose || mbtowc_announce == 0) {
151 			mbtowc_announce = 1;
152 		}
153 		return (-1);
154 	}
155 
156 	count = 1;
157 	while (bytes_left--) {
158 		if (((mbyte = *mbchar++) & 0xc0) != 0x80) {
159 			if (mbtowc_verbose || mbtowc_announce == 0) {
160 				mbtowc_announce = 1;
161 			}
162 			return (-1);
163 		}
164 
165 		count++;
166 		wide_char = (wide_char << 6) | (mbyte & 0x3f);
167 	}
168 
169 	if (wcharp)
170 		*wcharp = wide_char;
171 
172 	return (count);
173 }
174 
175 
176 /*
177  * wctomb
178  *
179  * The wctomb() function converts a wide character wchar into a multibyte
180  * character and stores the result in mbchar. The object pointed to by
181  * mbchar must be large enough to accommodate the multibyte character.
182  *
183  * Returns the numberof bytes written to mbchar.
184  */
185 int
186 mts_wctomb(char *mbchar, mts_wchar_t wchar)
187 {
188 #ifdef UTF8_DEBUG
189 	char *start = mbchar;
190 #endif
191 
192 	if ((wchar & ~0x7f) == 0) {
193 		*mbchar = (char)wchar;
194 		return (1);
195 	}
196 
197 	if ((wchar & ~0x7ff) == 0) {
198 		*mbchar++ = (wchar >> 6) | 0xc0;
199 		*mbchar = (wchar & 0x3f) | 0x80;
200 		return (2);
201 	}
202 
203 	*mbchar++ = (wchar >> 12) | 0xe0;
204 	*mbchar++ = ((wchar >> 6) & 0x3f) | 0x80;
205 	*mbchar = (wchar & 0x3f) | 0x80;
206 	return (3);
207 }
208 
209 
210 /*
211  * wcstombs
212  *
213  * The wcstombs() function converts a wide character string wcstring
214  * into a multibyte character string mbstring. Up to nbytes bytes are
215  * stored in mbstring. Partial multibyte characters at the end of the
216  * string are not stored. The multibyte character string is null
217  * terminated if there is room.
218  *
219  * Returns the number of bytes converted, not counting the terminating
220  * null byte.
221  */
222 size_t
223 mts_wcstombs(char *mbstring, const mts_wchar_t *wcstring, size_t nbytes)
224 {
225 	char *start = mbstring;
226 	const mts_wchar_t *wcp = wcstring;
227 	mts_wchar_t wide_char;
228 	char buf[4];
229 	size_t len;
230 
231 	if ((mbstring == 0) || (wcstring == 0))
232 		return (0);
233 
234 	while (nbytes > MTS_MB_CHAR_MAX) {
235 		wide_char = *wcp++;
236 		len = mts_wctomb(mbstring, wide_char);
237 
238 		if (wide_char == 0)
239 			/*LINTED E_PTRDIFF_OVERFLOW*/
240 			return (mbstring - start);
241 
242 		mbstring += len;
243 		nbytes -= len;
244 	}
245 
246 	while (wide_char && nbytes) {
247 		wide_char = *wcp++;
248 		if ((len = mts_wctomb(buf, wide_char)) > nbytes) {
249 			*mbstring = 0;
250 			break;
251 		}
252 
253 		bcopy(buf, mbstring, len);
254 		mbstring += len;
255 		nbytes -= len;
256 	}
257 
258 	/*LINTED E_PTRDIFF_OVERFLOW*/
259 	return (mbstring - start);
260 }
261 
262 
263 /*
264  * Returns the number of bytes that would be written if the multi-
265  * byte string mbs was converted to a wide character string, not
266  * counting the terminating null wide character.
267  */
268 size_t
269 mts_wcequiv_strlen(const char *mbs)
270 {
271 	mts_wchar_t	wide_char;
272 	size_t bytes;
273 	size_t len = 0;
274 
275 	while (*mbs) {
276 		bytes = mts_mbtowc(&wide_char, mbs, MTS_MB_CHAR_MAX);
277 		if (bytes == ((size_t)-1))
278 			return ((size_t)-1);
279 
280 		len += sizeof (mts_wchar_t);
281 		mbs += bytes;
282 	}
283 
284 	return (len);
285 }
286 
287 
288 /*
289  * Returns the number of bytes that would be written if the multi-
290  * byte string mbs was converted to a single byte character string,
291  * not counting the terminating null character.
292  */
293 size_t
294 mts_sbequiv_strlen(const char *mbs)
295 {
296 	mts_wchar_t	wide_char;
297 	size_t nbytes;
298 	size_t len = 0;
299 
300 	while (*mbs) {
301 		nbytes = mts_mbtowc(&wide_char, mbs, MTS_MB_CHAR_MAX);
302 		if (nbytes == ((size_t)-1))
303 			return ((size_t)-1);
304 
305 		if (wide_char & 0xFF00)
306 			len += sizeof (mts_wchar_t);
307 		else
308 			++len;
309 
310 		mbs += nbytes;
311 	}
312 
313 	return (len);
314 }
315 
316 
317 /*
318  * stombs
319  *
320  * Convert a regular null terminated string 'string' to a UTF-8 encoded
321  * null terminated multi-byte string 'mbstring'. Only full converted
322  * UTF-8 characters will be written 'mbstring'. If a character will not
323  * fit within the remaining buffer space or 'mbstring' will overflow
324  * max_mblen, the conversion process will be terminated and 'mbstring'
325  * will be null terminated.
326  *
327  * Returns the number of bytes written to 'mbstring', excluding the
328  * terminating null character.
329  *
330  * If either mbstring or string is a null pointer, -1 is returned.
331  */
332 int
333 mts_stombs(char *mbstring, char *string, int max_mblen)
334 {
335 	char *start = mbstring;
336 	unsigned char *p = (unsigned char *)string;
337 	int space_left = max_mblen;
338 	int	len;
339 	mts_wchar_t	wide_char;
340 	char buf[4];
341 
342 	if (!mbstring || !string)
343 		return (-1);
344 
345 	while (*p && space_left > 2) {
346 		wide_char = *p++;
347 		len = mts_wctomb(mbstring, wide_char);
348 		mbstring += len;
349 		space_left -= len;
350 	}
351 
352 	if (*p) {
353 		wide_char = *p;
354 		if ((len = mts_wctomb(buf, wide_char)) < 2) {
355 			*mbstring = *buf;
356 			mbstring += len;
357 			space_left -= len;
358 		}
359 	}
360 
361 	*mbstring = '\0';
362 
363 	/*LINTED E_PTRDIFF_OVERFLOW*/
364 	return (mbstring - start);
365 }
366 
367 
368 /*
369  * mbstos
370  *
371  * Convert a null terminated multi-byte string 'mbstring' to a regular
372  * null terminated string 'string'.  A 1-byte character in 'mbstring'
373  * maps to a 1-byte character in 'string'. A 2-byte character in
374  * 'mbstring' will be mapped to 2-bytes, if the upper byte is non-null.
375  * Otherwise the upper byte null will be discarded to ensure that the
376  * output stream does not contain embedded null characters.
377  *
378  * If the input stream contains invalid multi-byte characters, a value
379  * of -1 will be returned. Otherwise the length of 'string', excluding
380  * the terminating null character, is returned.
381  *
382  * If either mbstring or string is a null pointer, -1 is returned.
383  */
384 int
385 mts_mbstos(char *string, const char *mbstring)
386 {
387 	mts_wchar_t wc;
388 	unsigned char *start = (unsigned char *)string;
389 	int len;
390 
391 	if (string == 0 || mbstring == 0)
392 		return (-1);
393 
394 	while (*mbstring) {
395 		if ((len = mts_mbtowc(&wc, mbstring, MTS_MB_CHAR_MAX)) < 0) {
396 			*string = 0;
397 			return (-1);
398 		}
399 
400 		if (wc & 0xFF00) {
401 			/*LINTED E_BAD_PTR_CAST_ALIGN*/
402 			*((mts_wchar_t *)string) = wc;
403 			string += sizeof (mts_wchar_t);
404 		}
405 		else
406 		{
407 			*string = (unsigned char)wc;
408 			string++;
409 		}
410 
411 		mbstring += len;
412 	}
413 
414 	*string = 0;
415 
416 	/*LINTED E_PTRDIFF_OVERFLOW*/
417 	return ((unsigned char *)string - start);
418 }
419